From 01095a5d43bbfde13731688ddcf6048ebb8b7721 Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sat, 23 Jul 2016 20:41:05 +0000
Subject: Vendor import of llvm release_39 branch r276489:
 https://llvm.org/svn/llvm-project/llvm/branches/release_39@276489

---
 lib/Target/AArch64/AArch64.h | 3 +
 lib/Target/AArch64/AArch64.td | 187 +-
 lib/Target/AArch64/AArch64A53Fix835769.cpp | 10 +-
 lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 40 +-
 lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 14 +-
 lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 128 +-
 lib/Target/AArch64/AArch64AsmPrinter.cpp | 67 +-
 lib/Target/AArch64/AArch64BranchRelaxation.cpp | 30 +-
 lib/Target/AArch64/AArch64CallLowering.cpp | 104 +
 lib/Target/AArch64/AArch64CallLowering.h | 36 +
 lib/Target/AArch64/AArch64CallingConvention.td | 18 +
 .../AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 25 +-
 lib/Target/AArch64/AArch64CollectLOH.cpp | 23 +-
 lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 26 +-
 lib/Target/AArch64/AArch64ConditionalCompares.cpp | 24 +-
 .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 18 +-
 lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 215 +-
 lib/Target/AArch64/AArch64FastISel.cpp | 81 +-
 lib/Target/AArch64/AArch64FrameLowering.cpp | 895 +-
 lib/Target/AArch64/AArch64FrameLowering.h | 15 +-
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 2400 ++--
 lib/Target/AArch64/AArch64ISelLowering.cpp | 963 +-
 lib/Target/AArch64/AArch64ISelLowering.h | 60 +-
 lib/Target/AArch64/AArch64InstrAtomics.td | 51 +-
 lib/Target/AArch64/AArch64InstrFormats.td | 25 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp | 1743 ++-
 lib/Target/AArch64/AArch64InstrInfo.h | 107 +-
 lib/Target/AArch64/AArch64InstrInfo.td | 193 +-
 lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 1050 +-
 lib/Target/AArch64/AArch64MachineFunctionInfo.h | 38 +-
 lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 2 +-
 lib/Target/AArch64/AArch64PromoteConstant.cpp | 327 +-
 .../AArch64/AArch64RedundantCopyElimination.cpp | 182 +
 lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 168 +
 lib/Target/AArch64/AArch64RegisterBankInfo.h | 69 +
 lib/Target/AArch64/AArch64RegisterInfo.cpp | 26 +-
 lib/Target/AArch64/AArch64RegisterInfo.td | 2 +-
 lib/Target/AArch64/AArch64SchedA53.td | 4 +-
 lib/Target/AArch64/AArch64SchedA57.td | 3 +
 lib/Target/AArch64/AArch64SchedCyclone.td | 14 +-
 lib/Target/AArch64/AArch64SchedKryo.td | 133 +
 lib/Target/AArch64/AArch64SchedKryoDetails.td | 2358 ++++
 lib/Target/AArch64/AArch64SchedM1.td | 29 +-
 lib/Target/AArch64/AArch64SchedVulcan.td | 855 ++
 lib/Target/AArch64/AArch64Schedule.td | 8 +-
 lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 10 +-
 lib/Target/AArch64/AArch64SelectionDAGInfo.h | 14 +-
 lib/Target/AArch64/AArch64StorePairSuppress.cpp | 9 +-
 lib/Target/AArch64/AArch64Subtarget.cpp | 109 +-
 lib/Target/AArch64/AArch64Subtarget.h | 122 +-
 lib/Target/AArch64/AArch64SystemOperands.td | 1018 ++
 lib/Target/AArch64/AArch64TargetMachine.cpp | 138 +-
 lib/Target/AArch64/AArch64TargetMachine.h | 12 +-
 lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 77 +-
 lib/Target/AArch64/AArch64TargetTransformInfo.h | 11 +
 lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 214 +-
 lib/Target/AArch64/AsmParser/Makefile | 15 -
 lib/Target/AArch64/CMakeLists.txt | 19 +
 .../AArch64/Disassembler/AArch64Disassembler.cpp | 15 +-
 .../AArch64/Disassembler/AArch64Disassembler.h | 2 +-
 .../Disassembler/AArch64ExternalSymbolizer.cpp | 8 +-
 .../Disassembler/AArch64ExternalSymbolizer.h | 2 +-
 lib/Target/AArch64/Disassembler/Makefile | 16 -
 .../AArch64/InstPrinter/AArch64InstPrinter.cpp | 149 +-
 .../AArch64/InstPrinter/AArch64InstPrinter.h | 4 +-
 lib/Target/AArch64/InstPrinter/Makefile | 15 -
 lib/Target/AArch64/LLVMBuild.txt | 2 +-
 .../AArch64/MCTargetDesc/AArch64AddressingModes.h | 43 +
 .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 230 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 63 +-
 .../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 53 -
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 22 +-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 1 -
 lib/Target/AArch64/MCTargetDesc/Makefile | 16 -
 lib/Target/AArch64/Makefile | 25 -
 lib/Target/AArch64/TargetInfo/Makefile | 15 -
 lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 943 +-
 lib/Target/AArch64/Utils/AArch64BaseInfo.h | 1003 +-
 lib/Target/AArch64/Utils/Makefile | 16 -
 lib/Target/AMDGPU/AMDGPU.h | 57 +-
 lib/Target/AMDGPU/AMDGPU.td | 417 +-
 lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 3 +-
 lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 122 +-
 lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 21 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 469 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 28 +-
 lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 42 +
 lib/Target/AMDGPU/AMDGPUCallLowering.h | 36 +
 lib/Target/AMDGPU/AMDGPUCallingConv.td | 8 +-
 lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 82 +
 .../AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp | 26 -
 .../AMDGPU/AMDGPUDiagnosticInfoUnsupported.h | 48 -
 lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 32 +-
 lib/Target/AMDGPU/AMDGPUFrameLowering.h | 10 +-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 793 +-
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1488 +--
 lib/Target/AMDGPU/AMDGPUISelLowering.h | 78 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 307 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.h | 143 +-
 lib/Target/AMDGPU/AMDGPUInstrInfo.td | 27 +-
 lib/Target/AMDGPU/AMDGPUInstructions.td | 263 +-
 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 38 +-
 lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 6 +-
 lib/Target/AMDGPU/AMDGPUIntrinsics.td | 75 +-
 lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 57 +-
 lib/Target/AMDGPU/AMDGPUMCInstLower.h | 4 +-
 lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 17 +-
 lib/Target/AMDGPU/AMDGPUMachineFunction.h | 34 +-
 .../AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 1 -
 lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 667 +-
 lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 19 +-
 lib/Target/AMDGPU/AMDGPURegisterInfo.h | 21 +-
 lib/Target/AMDGPU/AMDGPURuntimeMetadata.h | 138 +
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 227 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h | 389 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 370 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.h | 61 +-
 lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 56 -
 lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 18 -
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 163 +-
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 49 +-
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 201 +-
 lib/Target/AMDGPU/AMDKernelCodeT.h | 31 +-
 lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2535 ++--
 lib/Target/AMDGPU/AsmParser/CMakeLists.txt | 2 +
 lib/Target/AMDGPU/AsmParser/Makefile | 15 -
 lib/Target/AMDGPU/CIInstructions.td | 148 +-
 lib/Target/AMDGPU/CMakeLists.txt | 25 +-
 lib/Target/AMDGPU/CaymanInstructions.td | 47 +-
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 437 +
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 93 +
 lib/Target/AMDGPU/Disassembler/CMakeLists.txt | 7 +
 lib/Target/AMDGPU/Disassembler/LLVMBuild.txt | 23 +
 lib/Target/AMDGPU/EvergreenInstructions.td | 67 +-
 lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 264 +
 lib/Target/AMDGPU/GCNHazardRecognizer.h | 62 +
 .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 385 +-
 lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 38 +-
 lib/Target/AMDGPU/InstPrinter/CMakeLists.txt | 2 +
 lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt | 2 +-
 lib/Target/AMDGPU/InstPrinter/Makefile | 15 -
 lib/Target/AMDGPU/LLVMBuild.txt | 7 +-
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 48 +-
 .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 55 +-
 .../AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 5 -
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 3 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h | 7 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 5 +-
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h | 4 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 4 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 11 -
 .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 7 +-
 .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 -
 .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 8 +-
 lib/Target/AMDGPU/MCTargetDesc/Makefile | 16 -
 .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 26 +-
 lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 42 +-
 lib/Target/AMDGPU/Makefile | 23 -
 lib/Target/AMDGPU/Processors.td | 23 +-
 lib/Target/AMDGPU/R600ClauseMergePass.cpp | 121 +-
 lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 158 +-
 lib/Target/AMDGPU/R600Defines.h | 4 +-
 lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 65 +-
 lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 30 +-
 lib/Target/AMDGPU/R600FrameLowering.cpp | 15 +
 lib/Target/AMDGPU/R600FrameLowering.h | 30 +
 lib/Target/AMDGPU/R600ISelLowering.cpp | 1006 +-
 lib/Target/AMDGPU/R600ISelLowering.h | 53 +-
 lib/Target/AMDGPU/R600InstrInfo.cpp | 509 +-
 lib/Target/AMDGPU/R600InstrInfo.h | 199 +-
 lib/Target/AMDGPU/R600Instructions.td | 124 +-
 lib/Target/AMDGPU/R600Intrinsics.td | 114 +-
 lib/Target/AMDGPU/R600MachineFunctionInfo.h | 7 +-
 lib/Target/AMDGPU/R600MachineScheduler.cpp | 136 +-
 lib/Target/AMDGPU/R600MachineScheduler.h | 11 +-
 lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 40 +-
 lib/Target/AMDGPU/R600Packetizer.cpp | 103 +-
 lib/Target/AMDGPU/R600RegisterInfo.cpp | 11 +-
 lib/Target/AMDGPU/R600RegisterInfo.h | 15 +-
 lib/Target/AMDGPU/R600Schedule.td | 2 +-
 .../AMDGPU/R600TextureIntrinsicsReplacer.cpp | 303 -
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 88 +-
 lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 96 +
 lib/Target/AMDGPU/SIDefines.h | 148 +-
 lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 9 +-
 lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp | 219 -
 lib/Target/AMDGPU/SIFoldOperands.cpp | 28 +-
 lib/Target/AMDGPU/SIFrameLowering.cpp | 144 +-
 lib/Target/AMDGPU/SIFrameLowering.h | 6 +
 lib/Target/AMDGPU/SIISelLowering.cpp | 1725 ++-
 lib/Target/AMDGPU/SIISelLowering.h | 74 +-
 lib/Target/AMDGPU/SIInsertWaits.cpp | 198 +-
 lib/Target/AMDGPU/SIInstrFormats.td | 126 +-
 lib/Target/AMDGPU/SIInstrInfo.cpp | 1736 +--
 lib/Target/AMDGPU/SIInstrInfo.h | 254 +-
 lib/Target/AMDGPU/SIInstrInfo.td | 1852 ++-
 lib/Target/AMDGPU/SIInstructions.td | 1298 +-
 lib/Target/AMDGPU/SIIntrinsics.td | 32 +-
 lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 50 +-
 lib/Target/AMDGPU/SILowerControlFlow.cpp | 645 +-
 lib/Target/AMDGPU/SILowerI1Copies.cpp | 17 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 104 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.h | 129 +-
 lib/Target/AMDGPU/SIMachineScheduler.cpp | 168 +-
 lib/Target/AMDGPU/SIMachineScheduler.h | 10 +-
 lib/Target/AMDGPU/SIRegisterInfo.cpp | 604 +-
 lib/Target/AMDGPU/SIRegisterInfo.h | 69 +-
 lib/Target/AMDGPU/SIRegisterInfo.td | 122 +-
 lib/Target/AMDGPU/SISchedule.td | 63 +-
 lib/Target/AMDGPU/SIShrinkInstructions.cpp | 152 +-
 lib/Target/AMDGPU/SITypeRewriter.cpp | 2 +-
 lib/Target/AMDGPU/SIWholeQuadMode.cpp | 509 +
 lib/Target/AMDGPU/TargetInfo/Makefile | 15 -
 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 69 +
 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 31 +
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 28 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 +-
 lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 165 +
 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 166 +
 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 39 +
 lib/Target/AMDGPU/Utils/CMakeLists.txt | 2 +
 lib/Target/AMDGPU/Utils/Makefile | 16 -
 lib/Target/AMDGPU/VIInstrFormats.td | 143 +-
 lib/Target/AMDGPU/VIInstructions.td | 50 +-
 lib/Target/ARM/A15SDOptimizer.cpp | 74 +-
 lib/Target/ARM/ARM.h | 4 +
 lib/Target/ARM/ARM.td | 218 +-
 lib/Target/ARM/ARMAsmPrinter.cpp | 134 +-
 lib/Target/ARM/ARMAsmPrinter.h | 1 +
 lib/Target/ARM/ARMBaseInstrInfo.cpp | 1309 +-
 lib/Target/ARM/ARMBaseInstrInfo.h | 131 +-
 lib/Target/ARM/ARMBaseRegisterInfo.cpp | 41 +-
 lib/Target/ARM/ARMBaseRegisterInfo.h | 12 +-
 lib/Target/ARM/ARMCallingConv.h | 2 +-
 lib/Target/ARM/ARMCallingConv.td | 52 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp | 160 +-
 lib/Target/ARM/ARMConstantPoolValue.cpp | 25 +-
 lib/Target/ARM/ARMConstantPoolValue.h | 11 +-
 lib/Target/ARM/ARMExpandPseudoInsts.cpp | 283 +-
 lib/Target/ARM/ARMFastISel.cpp | 143 +-
 lib/Target/ARM/ARMFrameLowering.cpp | 162 +-
 lib/Target/ARM/ARMFrameLowering.h | 2 +-
 lib/Target/ARM/ARMHazardRecognizer.cpp | 3 +-
 lib/Target/ARM/ARMISelDAGToDAG.cpp | 1043 +-
 lib/Target/ARM/ARMISelLowering.cpp | 1484 ++-
 lib/Target/ARM/ARMISelLowering.h | 99 +-
 lib/Target/ARM/ARMInstrFormats.td | 212 +-
 lib/Target/ARM/ARMInstrInfo.cpp | 22 +-
 lib/Target/ARM/ARMInstrInfo.h | 3 +-
 lib/Target/ARM/ARMInstrInfo.td | 355 +-
 lib/Target/ARM/ARMInstrThumb.td | 157 +-
 lib/Target/ARM/ARMInstrThumb2.td | 346 +-
 lib/Target/ARM/ARMInstrVFP.td | 430 +-
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 265 +-
 lib/Target/ARM/ARMMCInstLower.cpp | 43 +-
 lib/Target/ARM/ARMMachineFunctionInfo.cpp | 2 +-
 lib/Target/ARM/ARMMachineFunctionInfo.h | 1 -
 lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 8 +
 lib/Target/ARM/ARMSchedule.td | 14 +-
 lib/Target/ARM/ARMScheduleA8.td | 2 +-
 lib/Target/ARM/ARMScheduleA9.td | 4 +-
 lib/Target/ARM/ARMScheduleSwift.td | 2 +-
 lib/Target/ARM/ARMSelectionDAGInfo.cpp | 58 +-
 lib/Target/ARM/ARMSelectionDAGInfo.h | 42 +-
 lib/Target/ARM/ARMSubtarget.cpp | 157 +-
 lib/Target/ARM/ARMSubtarget.h | 273 +-
 lib/Target/ARM/ARMTargetMachine.cpp | 54 +-
 lib/Target/ARM/ARMTargetMachine.h | 31 +-
 lib/Target/ARM/ARMTargetObjectFile.h | 8 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 42 +-
 lib/Target/ARM/ARMTargetTransformInfo.h | 17 +-
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 309 +-
 lib/Target/ARM/AsmParser/Makefile | 15 -
 lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 108 +-
 lib/Target/ARM/Disassembler/Makefile | 16 -
 lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 120 +-
 lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 9 +
 lib/Target/ARM/InstPrinter/Makefile | 15 -
 lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h | 51 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 83 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 11 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h | 8 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h | 3 +
 lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h | 1 +
 lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 14 +-
 lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 19 +-
 lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 6 +
 lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h | 10 +
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 3 +-
 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 120 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 16 -
 .../ARM/MCTargetDesc/ARMMachORelocationInfo.cpp | 2 +-
 .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 9 +-
 lib/Target/ARM/MCTargetDesc/Makefile | 16 -
 lib/Target/ARM/MLxExpansionPass.cpp | 6 +-
 lib/Target/ARM/Makefile | 24 -
 lib/Target/ARM/README.txt | 18 +-
 lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 1 -
 lib/Target/ARM/TargetInfo/Makefile | 15 -
 lib/Target/ARM/Thumb1FrameLowering.cpp | 56 +-
 lib/Target/ARM/Thumb1FrameLowering.h | 2 +-
 lib/Target/ARM/Thumb1InstrInfo.cpp | 19 +-
 lib/Target/ARM/Thumb1InstrInfo.h | 8 +-
 lib/Target/ARM/Thumb2ITBlockPass.cpp | 11 +-
 lib/Target/ARM/Thumb2InstrInfo.cpp | 55 +-
 lib/Target/ARM/Thumb2InstrInfo.h | 12 +-
 lib/Target/ARM/Thumb2SizeReduction.cpp | 68 +-
 lib/Target/ARM/ThumbRegisterInfo.cpp | 120 +-
 lib/Target/ARM/ThumbRegisterInfo.h | 5 +-
 lib/Target/AVR/AVR.h | 4 +-
 lib/Target/AVR/AVR.td | 6 +-
 lib/Target/AVR/AVRConfig.h | 15 -
 lib/Target/AVR/AVRFrameLowering.h | 46 +
 lib/Target/AVR/AVRISelLowering.h | 152 +
 lib/Target/AVR/AVRInstrFormats.td | 577 +
 lib/Target/AVR/AVRInstrInfo.cpp | 466 +
 lib/Target/AVR/AVRInstrInfo.h | 110 +
 lib/Target/AVR/AVRInstrInfo.td | 1981 +++
 lib/Target/AVR/AVRMachineFunctionInfo.h | 6 +-
 lib/Target/AVR/AVRRegisterInfo.cpp | 256 +
 lib/Target/AVR/AVRRegisterInfo.h | 56 +
 lib/Target/AVR/AVRSelectionDAGInfo.h | 11 +-
 lib/Target/AVR/AVRSubtarget.cpp | 47 +
 lib/Target/AVR/AVRSubtarget.h | 119 +
 lib/Target/AVR/AVRTargetMachine.cpp | 101 +
 lib/Target/AVR/AVRTargetMachine.h | 51 +
 lib/Target/AVR/AVRTargetObjectFile.h | 5 +-
 lib/Target/AVR/CMakeLists.txt | 12 +-
 lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp | 66 +
 lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h | 29 +
 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp | 28 +
 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h | 31 +
 lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h | 57 +
 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp | 24 +
 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h | 32 +
 lib/Target/AVR/MCTargetDesc/CMakeLists.txt | 8 +
 lib/Target/AVR/MCTargetDesc/LLVMBuild.txt | 23 +
 lib/Target/AVR/Makefile | 19 -
 lib/Target/AVR/TODO.md | 7 +
 lib/Target/AVR/TargetInfo/CMakeLists.txt | 2 +
 lib/Target/AVR/TargetInfo/Makefile | 16 -
 lib/Target/BPF/BPFAsmPrinter.cpp | 28 -
 lib/Target/BPF/BPFFrameLowering.h | 4 +-
 lib/Target/BPF/BPFISelDAGToDAG.cpp | 37 +-
 lib/Target/BPF/BPFISelLowering.cpp | 148 +-
 lib/Target/BPF/BPFISelLowering.h | 13 +-
 lib/Target/BPF/BPFInstrInfo.cpp | 12 +-
 lib/Target/BPF/BPFInstrInfo.h | 6 +-
 lib/Target/BPF/BPFMCInstLower.cpp | 3 +-
 lib/Target/BPF/BPFSubtarget.h | 6 +-
 lib/Target/BPF/BPFTargetMachine.cpp | 15 +-
 lib/Target/BPF/BPFTargetMachine.h | 5 +-
 lib/Target/BPF/InstPrinter/Makefile | 16 -
 lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 3 +-
 lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 8 +-
 lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h | 4 +-
 lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 1 +
 lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 12 -
 lib/Target/BPF/MCTargetDesc/Makefile | 16 -
 lib/Target/BPF/Makefile | 21 -
 lib/Target/BPF/TargetInfo/Makefile | 16 -
 lib/Target/CppBackend/CMakeLists.txt | 5 -
 lib/Target/CppBackend/CPPBackend.cpp | 2143 ----
 lib/Target/CppBackend/CPPTargetMachine.h | 44 -
 lib/Target/CppBackend/LLVMBuild.txt | 31 -
 lib/Target/CppBackend/Makefile | 16 -
 lib/Target/CppBackend/TargetInfo/CMakeLists.txt | 3 -
 .../CppBackend/TargetInfo/CppBackendTargetInfo.cpp | 29 -
 lib/Target/CppBackend/TargetInfo/LLVMBuild.txt | 23 -
 lib/Target/CppBackend/TargetInfo/Makefile | 15 -
 lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 408 +-
 lib/Target/Hexagon/AsmParser/Makefile | 15 -
 lib/Target/Hexagon/BitTracker.cpp | 220 +-
 lib/Target/Hexagon/BitTracker.h | 10 +-
 lib/Target/Hexagon/CMakeLists.txt | 6 +-
 .../Hexagon/Disassembler/HexagonDisassembler.cpp | 7 +-
 lib/Target/Hexagon/Disassembler/Makefile | 16 -
 lib/Target/Hexagon/Hexagon.td | 27 +-
 lib/Target/Hexagon/HexagonAsmPrinter.cpp | 51 +-
 lib/Target/Hexagon/HexagonBitSimplify.cpp | 50 +-
 lib/Target/Hexagon/HexagonBitTracker.cpp | 75 +-
 lib/Target/Hexagon/HexagonBitTracker.h | 8 +-
 lib/Target/Hexagon/HexagonBlockRanges.cpp | 483 +
 lib/Target/Hexagon/HexagonBlockRanges.h | 239 +
 lib/Target/Hexagon/HexagonBranchRelaxation.cpp | 211 +
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 48 +-
 lib/Target/Hexagon/HexagonCommonGEP.cpp | 14 +-
 lib/Target/Hexagon/HexagonCopyToCombine.cpp | 278 +-
 lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 19 +-
 lib/Target/Hexagon/HexagonExpandCondsets.cpp | 1062 +-
 lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp | 357 -
 lib/Target/Hexagon/HexagonFixupHwLoops.cpp | 20 +-
 lib/Target/Hexagon/HexagonFrameLowering.cpp | 1269 +-
 lib/Target/Hexagon/HexagonFrameLowering.h | 61 +-
 lib/Target/Hexagon/HexagonGenExtract.cpp | 3 +
 lib/Target/Hexagon/HexagonGenInsert.cpp | 13 +-
 lib/Target/Hexagon/HexagonGenMux.cpp | 21 +-
 lib/Target/Hexagon/HexagonGenPredicate.cpp | 28 +-
 lib/Target/Hexagon/HexagonHardwareLoops.cpp | 34 +-
 lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 1237 +-
 lib/Target/Hexagon/HexagonISelLowering.cpp | 458 +-
 lib/Target/Hexagon/HexagonISelLowering.h | 54 +-
 lib/Target/Hexagon/HexagonInstrAlias.td | 192 +
 lib/Target/Hexagon/HexagonInstrFormats.td | 14 +-
 lib/Target/Hexagon/HexagonInstrFormatsV4.td | 7 +-
 lib/Target/Hexagon/HexagonInstrInfo.cpp | 1067 +-
 lib/Target/Hexagon/HexagonInstrInfo.h | 62 +-
 lib/Target/Hexagon/HexagonInstrInfo.td | 249 +-
 lib/Target/Hexagon/HexagonInstrInfoV3.td | 25 +-
 lib/Target/Hexagon/HexagonInstrInfoV4.td | 707 +-
 lib/Target/Hexagon/HexagonInstrInfoV60.td | 116 +-
 lib/Target/Hexagon/HexagonInstrInfoVector.td | 55 +-
 lib/Target/Hexagon/HexagonIntrinsics.td | 511 +-
 lib/Target/Hexagon/HexagonIntrinsicsV4.td | 193 +-
 lib/Target/Hexagon/HexagonIntrinsicsV5.td | 24 +-
 lib/Target/Hexagon/HexagonIntrinsicsV60.td | 4 +-
 lib/Target/Hexagon/HexagonMCInstLower.cpp | 66 +-
 lib/Target/Hexagon/HexagonMachineFunctionInfo.h | 19 +-
 lib/Target/Hexagon/HexagonMachineScheduler.cpp | 406 +-
 lib/Target/Hexagon/HexagonMachineScheduler.h | 20 +-
 lib/Target/Hexagon/HexagonNewValueJump.cpp | 150 +-
 lib/Target/Hexagon/HexagonOperands.td | 72 +-
 lib/Target/Hexagon/HexagonOptAddrMode.cpp | 663 +
 lib/Target/Hexagon/HexagonOptimizeSZextends.cpp | 3 +
 lib/Target/Hexagon/HexagonPeephole.cpp | 101 +-
 lib/Target/Hexagon/HexagonRDF.h | 4 +-
 lib/Target/Hexagon/HexagonRDFOpt.cpp | 88 +-
 lib/Target/Hexagon/HexagonRegisterInfo.cpp | 108 +-
 lib/Target/Hexagon/HexagonRegisterInfo.h | 18 +-
 lib/Target/Hexagon/HexagonRegisterInfo.td | 51 +-
 lib/Target/Hexagon/HexagonScheduleV4.td | 13 +-
 lib/Target/Hexagon/HexagonScheduleV55.td | 186 +-
 lib/Target/Hexagon/HexagonScheduleV60.td | 11 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 13 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.h | 16 +-
 .../Hexagon/HexagonSplitConst32AndConst64.cpp | 63 +-
 lib/Target/Hexagon/HexagonSplitDouble.cpp | 24 +-
 lib/Target/Hexagon/HexagonStoreWidening.cpp | 3 +
 lib/Target/Hexagon/HexagonSubtarget.cpp | 244 +
 lib/Target/Hexagon/HexagonSubtarget.h | 29 +-
 lib/Target/Hexagon/HexagonSystemInst.td | 21 +
 lib/Target/Hexagon/HexagonTargetMachine.cpp | 54 +-
 lib/Target/Hexagon/HexagonTargetMachine.h | 2 +-
 lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 393 +-
 lib/Target/Hexagon/HexagonTargetObjectFile.h | 37 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 172 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.h | 13 +-
 .../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 638 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 22 +-
 .../MCTargetDesc/HexagonELFObjectWriter.cpp | 60 +-
 .../Hexagon/MCTargetDesc/HexagonFixupKinds.h | 1 +
 .../Hexagon/MCTargetDesc/HexagonInstPrinter.cpp | 12 +-
 .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 1 +
 lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h | 1 -
 .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 6 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h | 5 +-
 .../Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp | 451 +-
 .../Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 6 +-
 .../Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 19 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp | 55 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h | 18 +-
 .../Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp | 182 +-
 .../Hexagon/MCTargetDesc/HexagonMCInstrInfo.h | 14 +-
 .../Hexagon/MCTargetDesc/HexagonMCShuffler.cpp | 1 -
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 70 +-
 lib/Target/Hexagon/MCTargetDesc/Makefile | 16 -
 lib/Target/Hexagon/Makefile | 26 -
 lib/Target/Hexagon/RDFCopy.cpp | 217 +-
 lib/Target/Hexagon/RDFCopy.h | 12 +-
 lib/Target/Hexagon/RDFDeadCode.cpp | 50 +-
 lib/Target/Hexagon/RDFDeadCode.h | 12 +-
 lib/Target/Hexagon/RDFGraph.cpp | 55 +-
 lib/Target/Hexagon/RDFGraph.h | 33 +-
 lib/Target/Hexagon/RDFLiveness.cpp | 100 +-
 lib/Target/Hexagon/RDFLiveness.h | 8 +-
 lib/Target/Hexagon/TargetInfo/Makefile | 15 -
 lib/Target/LLVMBuild.txt | 2 +-
 lib/Target/Lanai/AsmParser/CMakeLists.txt | 7 +
 lib/Target/Lanai/AsmParser/LLVMBuild.txt | 23 +
 lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 1213 ++
 lib/Target/Lanai/CMakeLists.txt | 35 +
 lib/Target/Lanai/Disassembler/CMakeLists.txt | 3 +
 lib/Target/Lanai/Disassembler/LLVMBuild.txt | 23 +
 .../Lanai/Disassembler/LanaiDisassembler.cpp | 240 +
 lib/Target/Lanai/Disassembler/LanaiDisassembler.h | 41 +
 lib/Target/Lanai/InstPrinter/CMakeLists.txt | 3 +
 lib/Target/Lanai/InstPrinter/LLVMBuild.txt | 23 +
 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp | 305 +
 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h | 65 +
 lib/Target/Lanai/LLVMBuild.txt | 45 +
 lib/Target/Lanai/Lanai.h | 51 +
 lib/Target/Lanai/Lanai.td | 47 +
 lib/Target/Lanai/LanaiAluCode.h | 148 +
 lib/Target/Lanai/LanaiAsmPrinter.cpp | 243 +
 lib/Target/Lanai/LanaiCallingConv.td | 50 +
 lib/Target/Lanai/LanaiCondCode.h | 100 +
 lib/Target/Lanai/LanaiDelaySlotFiller.cpp | 263 +
 lib/Target/Lanai/LanaiFrameLowering.cpp | 220 +
 lib/Target/Lanai/LanaiFrameLowering.h | 57 +
 lib/Target/Lanai/LanaiISelDAGToDAG.cpp | 317 +
 lib/Target/Lanai/LanaiISelLowering.cpp | 1437 +++
 lib/Target/Lanai/LanaiISelLowering.h | 148 +
 lib/Target/Lanai/LanaiInstrFormats.td | 561 +
 lib/Target/Lanai/LanaiInstrInfo.cpp | 803 ++
 lib/Target/Lanai/LanaiInstrInfo.h | 184 +
 lib/Target/Lanai/LanaiInstrInfo.td | 892 ++
 lib/Target/Lanai/LanaiMCInstLower.cpp | 140 +
 lib/Target/Lanai/LanaiMCInstLower.h | 48 +
 lib/Target/Lanai/LanaiMachineFunctionInfo.cpp | 23 +
 lib/Target/Lanai/LanaiMachineFunctionInfo.h | 58 +
 lib/Target/Lanai/LanaiMemAluCombiner.cpp | 422 +
 lib/Target/Lanai/LanaiRegisterInfo.cpp | 287 +
 lib/Target/Lanai/LanaiRegisterInfo.h | 63 +
 lib/Target/Lanai/LanaiRegisterInfo.td | 64 +
 lib/Target/Lanai/LanaiSchedule.td | 70 +
 lib/Target/Lanai/LanaiSelectionDAGInfo.cpp | 35 +
 lib/Target/Lanai/LanaiSelectionDAGInfo.h | 36 +
 lib/Target/Lanai/LanaiSubtarget.cpp | 47 +
 lib/Target/Lanai/LanaiSubtarget.h | 76 +
 lib/Target/Lanai/LanaiTargetMachine.cpp | 112 +
 lib/Target/Lanai/LanaiTargetMachine.h | 55 +
 lib/Target/Lanai/LanaiTargetObjectFile.cpp | 123 +
 lib/Target/Lanai/LanaiTargetObjectFile.h | 46 +
 lib/Target/Lanai/LanaiTargetTransformInfo.h | 86 +
 lib/Target/Lanai/MCTargetDesc/CMakeLists.txt | 8 +
 lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt | 23 +
 lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp | 172 +
 lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h | 119 +
 .../Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp | 95 +
 lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h | 43 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp | 43 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h | 31 +
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 326 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp | 60 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h | 56 +
 .../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 149 +
 lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h | 59 +
 lib/Target/Lanai/TargetInfo/CMakeLists.txt | 3 +
 lib/Target/Lanai/TargetInfo/LLVMBuild.txt | 23 +
 lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp | 20 +
 lib/Target/MSP430/InstPrinter/Makefile | 15 -
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 14 -
 lib/Target/MSP430/MCTargetDesc/Makefile | 16 -
 lib/Target/MSP430/MSP430BranchSelector.cpp | 25 +-
 lib/Target/MSP430/MSP430FrameLowering.cpp | 40 +-
 lib/Target/MSP430/MSP430FrameLowering.h | 6 +-
 lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 134 +-
 lib/Target/MSP430/MSP430ISelLowering.cpp | 126 +-
 lib/Target/MSP430/MSP430ISelLowering.h | 31 +-
 lib/Target/MSP430/MSP430InstrInfo.cpp | 45 +-
 lib/Target/MSP430/MSP430InstrInfo.h | 16 +-
 lib/Target/MSP430/MSP430Subtarget.h | 6 +-
 lib/Target/MSP430/MSP430TargetMachine.cpp | 12 +-
 lib/Target/MSP430/MSP430TargetMachine.h | 2 +-
 lib/Target/MSP430/Makefile | 23 -
 lib/Target/MSP430/TargetInfo/Makefile | 15 -
 lib/Target/Makefile | 20 -
 lib/Target/Mips/AsmParser/Makefile | 15 -
 lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 1724 +--
 lib/Target/Mips/CMakeLists.txt | 1 +
 lib/Target/Mips/Disassembler/Makefile | 16 -
 lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 376 +-
 lib/Target/Mips/InstPrinter/Makefile | 16 -
 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 96 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 4 +-
 lib/Target/Mips/MCTargetDesc/Makefile | 17 -
 .../Mips/MCTargetDesc/MipsABIFlagsSection.cpp | 22 +-
 lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h | 17 +-
 lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp | 47 +-
 lib/Target/Mips/MCTargetDesc/MipsABIInfo.h | 9 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 77 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 5 +-
 lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h | 10 +-
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 614 +-
 lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 19 +-
 lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 5 +
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 415 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h | 27 +
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 295 +-
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 66 +-
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 55 +-
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 378 +-
 lib/Target/Mips/Makefile | 25 -
 lib/Target/Mips/MicroMips32r6InstrFormats.td | 238 +-
 lib/Target/Mips/MicroMips32r6InstrInfo.td | 717 +-
 lib/Target/Mips/MicroMips64r6InstrFormats.td | 141 +-
 lib/Target/Mips/MicroMips64r6InstrInfo.td | 404 +-
 lib/Target/Mips/MicroMipsDSPInstrFormats.td | 58 +
 lib/Target/Mips/MicroMipsDSPInstrInfo.td | 79 +-
 lib/Target/Mips/MicroMipsInstrFPU.td | 39 +-
 lib/Target/Mips/MicroMipsInstrFormats.td | 9 +-
 lib/Target/Mips/MicroMipsInstrInfo.td | 360 +-
 lib/Target/Mips/Mips.h | 3 +-
 lib/Target/Mips/Mips.td | 5 +
 lib/Target/Mips/Mips16FrameLowering.cpp | 1 -
 lib/Target/Mips/Mips16HardFloat.cpp | 28 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 182 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.h | 26 +-
 lib/Target/Mips/Mips16ISelLowering.cpp | 137 +-
 lib/Target/Mips/Mips16ISelLowering.h | 28 +-
 lib/Target/Mips/Mips16InstrInfo.cpp | 25 +-
 lib/Target/Mips/Mips16InstrInfo.h | 16 +-
 lib/Target/Mips/Mips16InstrInfo.td | 85 +-
 lib/Target/Mips/Mips16RegisterInfo.cpp | 3 -
 lib/Target/Mips/Mips32r6InstrFormats.td | 2 +-
 lib/Target/Mips/Mips32r6InstrInfo.td | 343 +-
 lib/Target/Mips/Mips64InstrInfo.td | 475 +-
 lib/Target/Mips/Mips64r6InstrInfo.td | 98 +-
 lib/Target/Mips/MipsAsmPrinter.cpp | 65 +-
 lib/Target/Mips/MipsAsmPrinter.h | 2 -
 lib/Target/Mips/MipsCCState.cpp | 4 +-
 lib/Target/Mips/MipsCallingConv.td | 48 -
 lib/Target/Mips/MipsCondMov.td | 12 +-
 lib/Target/Mips/MipsConstantIslandPass.cpp | 114 +-
 lib/Target/Mips/MipsDSPInstrFormats.td | 4 +
 lib/Target/Mips/MipsDSPInstrInfo.td | 69 +-
 lib/Target/Mips/MipsDelaySlotFiller.cpp | 144 +-
 lib/Target/Mips/MipsEVAInstrInfo.td | 73 +-
 lib/Target/Mips/MipsFastISel.cpp | 39 +-
 lib/Target/Mips/MipsFrameLowering.cpp | 16 +-
 lib/Target/Mips/MipsFrameLowering.h | 2 +-
 lib/Target/Mips/MipsHazardSchedule.cpp | 147 +
 lib/Target/Mips/MipsISelDAGToDAG.cpp | 53 +-
 lib/Target/Mips/MipsISelDAGToDAG.h | 25 +-
 lib/Target/Mips/MipsISelLowering.cpp | 419 +-
 lib/Target/Mips/MipsISelLowering.h | 96 +-
 lib/Target/Mips/MipsInstrFPU.td | 130 +-
 lib/Target/Mips/MipsInstrFormats.td | 7 +-
 lib/Target/Mips/MipsInstrInfo.cpp | 236 +-
 lib/Target/Mips/MipsInstrInfo.h | 26 +-
 lib/Target/Mips/MipsInstrInfo.td | 1036 +-
 lib/Target/Mips/MipsLongBranch.cpp | 72 +-
 lib/Target/Mips/MipsMCInstLower.cpp | 155 +-
 lib/Target/Mips/MipsMCInstLower.h | 8 +-
 lib/Target/Mips/MipsMSAInstrInfo.td | 220 +-
 lib/Target/Mips/MipsMachineFunction.cpp | 16 +-
 lib/Target/Mips/MipsMachineFunction.h | 15 +-
 lib/Target/Mips/MipsOs16.cpp | 26 +-
 lib/Target/Mips/MipsRegisterInfo.cpp | 20 +-
 lib/Target/Mips/MipsRegisterInfo.h | 22 +-
 lib/Target/Mips/MipsRegisterInfo.td | 13 +
 lib/Target/Mips/MipsSEFrameLowering.cpp | 12 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 141 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.h | 34 +-
 lib/Target/Mips/MipsSEISelLowering.cpp | 256 +-
 lib/Target/Mips/MipsSEISelLowering.h | 24 +-
 lib/Target/Mips/MipsSEInstrInfo.cpp | 94 +-
 lib/Target/Mips/MipsSEInstrInfo.h | 13 +-
 lib/Target/Mips/MipsSERegisterInfo.cpp | 54 +-
 lib/Target/Mips/MipsSchedule.td | 226 +-
 lib/Target/Mips/MipsScheduleP5600.td | 2 +-
 lib/Target/Mips/MipsSubtarget.cpp | 9 +-
 lib/Target/Mips/MipsSubtarget.h | 14 +-
 lib/Target/Mips/MipsTargetMachine.cpp | 34 +-
 lib/Target/Mips/MipsTargetMachine.h | 13 +-
 lib/Target/Mips/MipsTargetObjectFile.cpp | 16 +-
 lib/Target/Mips/MipsTargetObjectFile.h | 3 +-
 lib/Target/Mips/MipsTargetStreamer.h | 77 +-
 lib/Target/Mips/TargetInfo/Makefile | 15 -
 lib/Target/NVPTX/CMakeLists.txt | 2 +
 lib/Target/NVPTX/InstPrinter/Makefile | 15 -
 lib/Target/NVPTX/MCTargetDesc/Makefile | 16 -
 lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 7 +-
 .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 16 -
 lib/Target/NVPTX/Makefile | 23 -
 lib/Target/NVPTX/NVPTX.h | 8 +-
 lib/Target/NVPTX/NVPTX.td | 14 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 52 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.h | 6 +-
 .../NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp | 7 +-
 lib/Target/NVPTX/NVPTXFrameLowering.cpp | 7 +-
 lib/Target/NVPTX/NVPTXFrameLowering.h | 2 +-
 lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 16 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 528 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 42 +-
 lib/Target/NVPTX/NVPTXISelLowering.cpp | 126 +-
 lib/Target/NVPTX/NVPTXISelLowering.h | 20 +-
 lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 3 +
 lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp | 586 +
 lib/Target/NVPTX/NVPTXInstrInfo.cpp | 61 +-
 lib/Target/NVPTX/NVPTXInstrInfo.h | 19 +-
 lib/Target/NVPTX/NVPTXInstrInfo.td | 3025 +++--
 lib/Target/NVPTX/NVPTXIntrinsics.td | 349 +-
 lib/Target/NVPTX/NVPTXLowerAlloca.cpp | 3 +
 lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp | 2 +-
 lib/Target/NVPTX/NVPTXMCExpr.cpp | 4 +-
 lib/Target/NVPTX/NVPTXMCExpr.h | 9 +-
 lib/Target/NVPTX/NVPTXPeephole.cpp | 3 +
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 9 +-
 lib/Target/NVPTX/NVPTXSection.h | 1 -
 lib/Target/NVPTX/NVPTXSubtarget.h | 6 +-
 lib/Target/NVPTX/NVPTXTargetMachine.cpp | 129 +-
 lib/Target/NVPTX/NVPTXTargetMachine.h | 11 +-
 lib/Target/NVPTX/NVPTXTargetObjectFile.h | 4 +-
 lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +-
 lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 4 +
 lib/Target/NVPTX/NVPTXUtilities.cpp | 4 +-
 lib/Target/NVPTX/NVPTXUtilities.h | 5 +-
 lib/Target/NVPTX/NVVMIntrRange.cpp | 148 +
 lib/Target/NVPTX/NVVMReflect.cpp | 178 +-
 lib/Target/NVPTX/TargetInfo/Makefile | 15 -
 lib/Target/PowerPC/AsmParser/Makefile | 15 -
 lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 66 +-
 lib/Target/PowerPC/CMakeLists.txt | 3 +-
 lib/Target/PowerPC/Disassembler/Makefile | 16 -
 .../PowerPC/Disassembler/PPCDisassembler.cpp | 17 +-
 lib/Target/PowerPC/InstPrinter/Makefile | 16 -
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 25 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 2 +
 lib/Target/PowerPC/MCTargetDesc/Makefile | 16 -
 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 4 +-
 .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 18 +-
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 23 +-
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 18 +-
 .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 2 +-
 lib/Target/PowerPC/Makefile | 24 -
 lib/Target/PowerPC/PPC.h | 22 +-
 lib/Target/PowerPC/PPC.td | 71 +-
 lib/Target/PowerPC/PPCAsmPrinter.cpp | 237 +-
 lib/Target/PowerPC/PPCBoolRetToInt.cpp | 13 +-
 lib/Target/PowerPC/PPCBranchSelector.cpp | 15 +-
 lib/Target/PowerPC/PPCCCState.cpp | 36 +
 lib/Target/PowerPC/PPCCCState.h | 42 +
 lib/Target/PowerPC/PPCCTRLoops.cpp | 39 +-
 lib/Target/PowerPC/PPCCallingConv.td | 25 +-
 lib/Target/PowerPC/PPCEarlyReturn.cpp | 20 +-
 lib/Target/PowerPC/PPCFastISel.cpp | 71 +-
 lib/Target/PowerPC/PPCFrameLowering.cpp | 131 +-
 lib/Target/PowerPC/PPCFrameLowering.h | 13 +-
 lib/Target/PowerPC/PPCHazardRecognizers.cpp | 7 +-
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 452 +-
 lib/Target/PowerPC/PPCISelLowering.cpp | 2122 ++--
 lib/Target/PowerPC/PPCISelLowering.h | 273 +-
 lib/Target/PowerPC/PPCInstr64Bit.td | 61 +-
 lib/Target/PowerPC/PPCInstrAltivec.td | 184 +
 lib/Target/PowerPC/PPCInstrFormats.td | 258 +
 lib/Target/PowerPC/PPCInstrInfo.cpp | 410 +-
 lib/Target/PowerPC/PPCInstrInfo.h | 65 +-
 lib/Target/PowerPC/PPCInstrInfo.td | 112 +-
 lib/Target/PowerPC/PPCInstrVSX.td | 516 +-
 lib/Target/PowerPC/PPCLoopDataPrefetch.cpp | 233 -
 lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 3 +
 lib/Target/PowerPC/PPCMCInstLower.cpp | 42 +-
 lib/Target/PowerPC/PPCMIPeephole.cpp | 2 +
 lib/Target/PowerPC/PPCMachineFunctionInfo.h | 10 +-
 lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 166 +
 lib/Target/PowerPC/PPCRegisterInfo.cpp | 38 +-
 lib/Target/PowerPC/PPCRegisterInfo.h | 1 +
 lib/Target/PowerPC/PPCSchedule.td | 7 +
 lib/Target/PowerPC/PPCSchedule440.td | 3 +-
 lib/Target/PowerPC/PPCScheduleA2.td | 7 +-
 lib/Target/PowerPC/PPCScheduleE500mc.td | 15 +-
 lib/Target/PowerPC/PPCScheduleE5500.td | 19 +-
 lib/Target/PowerPC/PPCScheduleG5.td | 17 +-
 lib/Target/PowerPC/PPCScheduleP7.td | 3 +-
 lib/Target/PowerPC/PPCScheduleP8.td | 3 +-
 lib/Target/PowerPC/PPCSubtarget.cpp | 34 +-
 lib/Target/PowerPC/PPCSubtarget.h | 29 +-
 lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 5 +-
 lib/Target/PowerPC/PPCTOCRegDeps.cpp | 5 +-
 lib/Target/PowerPC/PPCTargetMachine.cpp | 63 +-
 lib/Target/PowerPC/PPCTargetMachine.h | 15 +-
 lib/Target/PowerPC/PPCTargetObjectFile.cpp | 2 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 30 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.h | 2 +
 lib/Target/PowerPC/PPCVSXCopy.cpp | 5 +-
 lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 68 +-
 lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 43 +-
 lib/Target/PowerPC/README.txt | 11 +
 lib/Target/PowerPC/README_P9.txt | 605 +
 lib/Target/PowerPC/TargetInfo/Makefile | 15 -
 lib/Target/PowerPC/p9-instrs.txt | 442 +
 lib/Target/README.txt | 2 +-
 lib/Target/Sparc/AsmParser/Makefile | 15 -
 lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 99 +-
 lib/Target/Sparc/CMakeLists.txt | 1 +
 lib/Target/Sparc/DelaySlotFiller.cpp | 39 +-
 lib/Target/Sparc/Disassembler/Makefile | 16 -
 .../Sparc/Disassembler/SparcDisassembler.cpp | 121 +-
 lib/Target/Sparc/InstPrinter/Makefile | 16 -
 lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp | 23 +-
 lib/Target/Sparc/LeonFeatures.td | 91 +
 lib/Target/Sparc/LeonPasses.cpp | 933 ++
 lib/Target/Sparc/LeonPasses.h | 199 +
 lib/Target/Sparc/MCTargetDesc/Makefile | 16 -
 lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 3 +-
 .../Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 7 +-
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 1 +
 .../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 35 +-
 lib/Target/Sparc/Makefile | 24 -
 lib/Target/Sparc/README.txt | 1 -
 lib/Target/Sparc/Sparc.h | 35 +-
 lib/Target/Sparc/Sparc.td | 140 +-
 lib/Target/Sparc/SparcAsmPrinter.cpp | 8 +-
 lib/Target/Sparc/SparcFrameLowering.cpp | 8 +-
 lib/Target/Sparc/SparcFrameLowering.h | 2 +-
 lib/Target/Sparc/SparcISelDAGToDAG.cpp | 36 +-
 lib/Target/Sparc/SparcISelLowering.cpp | 935 +-
 lib/Target/Sparc/SparcISelLowering.h | 107 +-
 lib/Target/Sparc/SparcInstr64Bit.td | 43 +-
 lib/Target/Sparc/SparcInstrAliases.td | 117 +-
 lib/Target/Sparc/SparcInstrFormats.td | 122 +-
 lib/Target/Sparc/SparcInstrInfo.cpp | 107 +-
 lib/Target/Sparc/SparcInstrInfo.h | 18 +-
 lib/Target/Sparc/SparcInstrInfo.td | 509 +-
 lib/Target/Sparc/SparcMCInstLower.cpp | 1 -
 lib/Target/Sparc/SparcRegisterInfo.cpp | 10 +-
 lib/Target/Sparc/SparcRegisterInfo.h | 3 -
 lib/Target/Sparc/SparcRegisterInfo.td | 78 +-
 lib/Target/Sparc/SparcSchedule.td | 124 +
 lib/Target/Sparc/SparcSubtarget.cpp | 25 +-
 lib/Target/Sparc/SparcSubtarget.h | 50 +-
 lib/Target/Sparc/SparcTargetMachine.cpp | 104 +-
 lib/Target/Sparc/SparcTargetMachine.h | 21 +-
 lib/Target/Sparc/TargetInfo/Makefile | 15 -
 lib/Target/SystemZ/AsmParser/Makefile | 16 -
 lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 5 +-
 lib/Target/SystemZ/CMakeLists.txt | 1 +
 lib/Target/SystemZ/Disassembler/Makefile | 16 -
 .../SystemZ/Disassembler/SystemZDisassembler.cpp | 57 +-
 lib/Target/SystemZ/InstPrinter/Makefile | 16 -
 lib/Target/SystemZ/MCTargetDesc/Makefile | 16 -
 .../SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 3 +-
 .../SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp | 7 +-
 .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 22 +-
 lib/Target/SystemZ/Makefile | 28 -
 lib/Target/SystemZ/README.txt | 30 +-
 lib/Target/SystemZ/SystemZ.h | 41 +
 lib/Target/SystemZ/SystemZAsmPrinter.cpp | 200 +
 lib/Target/SystemZ/SystemZCallingConv.cpp | 4 +-
 lib/Target/SystemZ/SystemZCallingConv.h | 50 +-
 lib/Target/SystemZ/SystemZCallingConv.td | 15 +
 lib/Target/SystemZ/SystemZElimCompare.cpp | 214 +-
 lib/Target/SystemZ/SystemZFrameLowering.cpp | 26 +-
 lib/Target/SystemZ/SystemZFrameLowering.h | 7 +-
 lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 266 +-
 lib/Target/SystemZ/SystemZISelLowering.cpp | 1043 +-
 lib/Target/SystemZ/SystemZISelLowering.h | 97 +-
 lib/Target/SystemZ/SystemZInstrBuilder.h | 2 +-
 lib/Target/SystemZ/SystemZInstrFP.td | 11 +
 lib/Target/SystemZ/SystemZInstrFormats.td | 206 +-
 lib/Target/SystemZ/SystemZInstrInfo.cpp | 607 +-
 lib/Target/SystemZ/SystemZInstrInfo.h | 85 +-
 lib/Target/SystemZ/SystemZInstrInfo.td | 356 +-
 lib/Target/SystemZ/SystemZLDCleanup.cpp | 7 +-
 lib/Target/SystemZ/SystemZLongBranch.cpp | 19 +-
 lib/Target/SystemZ/SystemZMachineFunctionInfo.h | 9 +-
 lib/Target/SystemZ/SystemZOperands.td | 8 +-
 lib/Target/SystemZ/SystemZOperators.td | 29 +
 lib/Target/SystemZ/SystemZProcessors.td | 8 +-
 lib/Target/SystemZ/SystemZRegisterInfo.cpp | 16 +-
 lib/Target/SystemZ/SystemZRegisterInfo.h | 9 +
 lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 102 +-
 lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 36 +-
 lib/Target/SystemZ/SystemZShortenInst.cpp | 34 +-
 lib/Target/SystemZ/SystemZSubtarget.cpp | 14 +-
 lib/Target/SystemZ/SystemZSubtarget.h | 9 +-
 lib/Target/SystemZ/SystemZTDC.cpp | 382 +
 lib/Target/SystemZ/SystemZTargetMachine.cpp | 20 +-
 lib/Target/SystemZ/SystemZTargetMachine.h | 2 +-
 lib/Target/SystemZ/TargetInfo/Makefile | 15 -
 lib/Target/Target.cpp | 22 +-
 lib/Target/TargetLoweringObjectFile.cpp | 23 +-
 lib/Target/TargetMachine.cpp | 117 +-
 lib/Target/TargetMachineC.cpp | 23 +-
 lib/Target/TargetRecip.cpp | 10 +-
 lib/Target/TargetSubtargetInfo.cpp | 2 -
 lib/Target/WebAssembly/CMakeLists.txt | 7 +-
 lib/Target/WebAssembly/Disassembler/Makefile | 16 -
 .../Disassembler/WebAssemblyDisassembler.cpp | 6 +-
 lib/Target/WebAssembly/InstPrinter/Makefile | 16 -
 .../InstPrinter/WebAssemblyInstPrinter.cpp | 52 +-
 .../InstPrinter/WebAssemblyInstPrinter.h | 5 +-
 lib/Target/WebAssembly/MCTargetDesc/Makefile | 16 -
 .../MCTargetDesc/WebAssemblyAsmBackend.cpp | 9 +-
 .../MCTargetDesc/WebAssemblyELFObjectWriter.cpp | 7 +-
 .../MCTargetDesc/WebAssemblyMCAsmInfo.cpp | 4 +-
 .../MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 13 +-
 .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 19 +-
 .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 56 +-
 .../MCTargetDesc/WebAssemblyTargetStreamer.cpp | 12 +-
 .../MCTargetDesc/WebAssemblyTargetStreamer.h | 9 +
 lib/Target/WebAssembly/Makefile | 26 -
 lib/Target/WebAssembly/README.txt | 95 +-
 lib/Target/WebAssembly/Relooper.cpp | 984 --
 lib/Target/WebAssembly/Relooper.h | 186 -
 lib/Target/WebAssembly/TargetInfo/Makefile | 15 -
 lib/Target/WebAssembly/WebAssembly.h | 11 +-
 lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp | 14 +-
 lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 86 +-
 lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 338 +-
 lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 1108 +-
 .../WebAssemblyFixIrreducibleControlFlow.cpp | 296 +
 .../WebAssembly/WebAssemblyFrameLowering.cpp | 206 +-
 lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 21 +-
 lib/Target/WebAssembly/WebAssemblyISD.def | 2 +-
 lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 18 +-
 lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 361 +-
 lib/Target/WebAssembly/WebAssemblyISelLowering.h | 25 +-
 lib/Target/WebAssembly/WebAssemblyInstrControl.td | 34 +-
 lib/Target/WebAssembly/WebAssemblyInstrFloat.td | 16 +-
 lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 64 +-
 lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 12 +-
 lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 41 +-
 lib/Target/WebAssembly/WebAssemblyInstrInteger.td | 35 +-
 lib/Target/WebAssembly/WebAssemblyInstrMemory.td | 608 +-
 .../WebAssembly/WebAssemblyLowerBrUnless.cpp | 21 +-
 .../WebAssembly/WebAssemblyMachineFunctionInfo.h | 40 +-
 .../WebAssemblyOptimizeLiveIntervals.cpp | 105 +
 lib/Target/WebAssembly/WebAssemblyPEI.cpp | 1066 --
 lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 135 +-
 .../WebAssemblyPrepareForLiveIntervals.cpp | 136 +
 lib/Target/WebAssembly/WebAssemblyRegColoring.cpp | 4 +-
 lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp | 32 +-
 lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 722 +-
 lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 73 +-
 .../WebAssembly/WebAssemblyReplacePhysRegs.cpp | 97 +
 .../WebAssembly/WebAssemblySelectionDAGInfo.h | 6 +-
 .../WebAssembly/WebAssemblySetP2AlignOperands.cpp | 114 +
 lib/Target/WebAssembly/WebAssemblyStoreResults.cpp | 158 +-
 lib/Target/WebAssembly/WebAssemblySubtarget.cpp | 12 +-
 .../WebAssembly/WebAssemblyTargetMachine.cpp | 104 +-
 lib/Target/WebAssembly/WebAssemblyTargetMachine.h | 4 +-
 .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 56 +
 .../WebAssembly/WebAssemblyTargetTransformInfo.h | 10 +-
 lib/Target/WebAssembly/known_gcc_test_failures.txt | 248 +-
 lib/Target/X86/AsmParser/Makefile | 15 -
 lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 4 +-
 lib/Target/X86/AsmParser/X86AsmParser.cpp | 366 +-
 lib/Target/X86/AsmParser/X86AsmParserCommon.h | 2 +
 lib/Target/X86/AsmParser/X86Operand.h | 61 +-
 lib/Target/X86/CMakeLists.txt | 7 +-
 lib/Target/X86/Disassembler/Makefile | 18 -
 lib/Target/X86/Disassembler/X86Disassembler.cpp | 108 +-
 lib/Target/X86/Disassembler/X86Disassembler.h | 112 -
 .../X86/Disassembler/X86DisassemblerDecoder.cpp | 24 +-
 .../X86/Disassembler/X86DisassemblerDecoder.h | 7 +
 .../Disassembler/X86DisassemblerDecoderCommon.h | 10 -
 lib/Target/X86/InstPrinter/Makefile | 15 -
 lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 23 +-
 lib/Target/X86/InstPrinter/X86InstComments.cpp | 837 +-
 lib/Target/X86/MCTargetDesc/CMakeLists.txt | 2 -
 lib/Target/X86/MCTargetDesc/Makefile | 16 -
 lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 114 +-
 lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 31 +-
 lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 75 +-
 .../X86/MCTargetDesc/X86ELFRelocationInfo.cpp | 141 -
 lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 6 +
 lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 2 +-
 lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 618 +-
 lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 101 +-
 lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 10 +-
 .../X86/MCTargetDesc/X86MachORelocationInfo.cpp | 119 -
 .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 7 +-
 .../X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 8 +
 lib/Target/X86/Makefile | 23 -
 lib/Target/X86/README-X86-64.txt | 2 +-
 lib/Target/X86/README.txt | 6 +-
 lib/Target/X86/TargetInfo/Makefile | 16 -
 lib/Target/X86/Utils/Makefile | 15 -
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 162 +-
 lib/Target/X86/Utils/X86ShuffleDecode.h | 94 +-
 lib/Target/X86/X86.h | 15 +
 lib/Target/X86/X86.td | 395 +-
 lib/Target/X86/X86AsmPrinter.cpp | 81 +-
 lib/Target/X86/X86AsmPrinter.h | 59 +-
 lib/Target/X86/X86CallFrameOptimization.cpp | 190 +-
 lib/Target/X86/X86CallingConv.td | 60 +-
 lib/Target/X86/X86ExpandPseudo.cpp | 82 +-
 lib/Target/X86/X86FastISel.cpp | 332 +-
 lib/Target/X86/X86FixupBWInsts.cpp | 371 +
 lib/Target/X86/X86FixupLEAs.cpp | 124 +-
 lib/Target/X86/X86FixupSetCC.cpp | 186 +
 lib/Target/X86/X86FloatingPoint.cpp | 410 +-
 lib/Target/X86/X86FrameLowering.cpp | 529 +-
 lib/Target/X86/X86FrameLowering.h | 53 +-
 lib/Target/X86/X86ISelDAGToDAG.cpp | 444 +-
 lib/Target/X86/X86ISelLowering.cpp | 12248 ++++++++++++-------
 lib/Target/X86/X86ISelLowering.h | 306 +-
 lib/Target/X86/X86InstrAVX512.td | 3106 +++--
 lib/Target/X86/X86InstrBuilder.h | 30 +-
 lib/Target/X86/X86InstrCompiler.td | 216 +-
 lib/Target/X86/X86InstrControl.td | 12 +-
 lib/Target/X86/X86InstrFPStack.td | 10 +-
 lib/Target/X86/X86InstrFormats.td | 2 +-
 lib/Target/X86/X86InstrFragmentsSIMD.td | 244 +-
 lib/Target/X86/X86InstrInfo.cpp | 2229 ++--
 lib/Target/X86/X86InstrInfo.h | 268 +-
 lib/Target/X86/X86InstrInfo.td | 367 +-
 lib/Target/X86/X86InstrMMX.td | 2 +-
 lib/Target/X86/X86InstrMPX.td | 4 +-
 lib/Target/X86/X86InstrSSE.td | 1473 ++-
 lib/Target/X86/X86InstrSystem.td | 40 +-
 lib/Target/X86/X86InstrVMX.td | 6 +-
 lib/Target/X86/X86InstrXOP.td | 214 +-
 lib/Target/X86/X86IntrinsicsInfo.h | 928 +-
 lib/Target/X86/X86MCInstLower.cpp | 635 +-
 lib/Target/X86/X86MachineFunctionInfo.h | 13 +-
 lib/Target/X86/X86OptimizeLEAs.cpp | 480 +-
 lib/Target/X86/X86PadShortFunction.cpp | 14 +-
 lib/Target/X86/X86RegisterInfo.cpp | 52 +-
 lib/Target/X86/X86RegisterInfo.td | 63 +-
 lib/Target/X86/X86Schedule.td | 15 +-
 lib/Target/X86/X86ScheduleAtom.td | 1 +
 lib/Target/X86/X86SelectionDAGInfo.cpp | 17 +-
 lib/Target/X86/X86SelectionDAGInfo.h | 26 +-
 lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 218 +-
 lib/Target/X86/X86ShuffleDecodeConstantPool.h | 15 +-
 lib/Target/X86/X86Subtarget.cpp | 175 +-
 lib/Target/X86/X86Subtarget.h | 97 +-
 lib/Target/X86/X86TargetMachine.cpp | 94 +-
 lib/Target/X86/X86TargetMachine.h | 5 +-
 lib/Target/X86/X86TargetObjectFile.cpp | 75 +-
 lib/Target/X86/X86TargetObjectFile.h | 13 +-
 lib/Target/X86/X86TargetTransformInfo.cpp | 405 +-
 lib/Target/X86/X86TargetTransformInfo.h | 5 +
 lib/Target/X86/X86VZeroUpper.cpp | 88 +-
 lib/Target/X86/X86WinAllocaExpander.cpp | 294 +
 lib/Target/X86/X86WinEHState.cpp | 464 +-
 lib/Target/XCore/Disassembler/Makefile | 16 -
 .../XCore/Disassembler/XCoreDisassembler.cpp | 2 +-
 lib/Target/XCore/InstPrinter/Makefile | 16 -
 lib/Target/XCore/MCTargetDesc/Makefile | 16 -
 .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 18 +-
 lib/Target/XCore/Makefile | 23 -
 lib/Target/XCore/TargetInfo/Makefile | 16 -
 lib/Target/XCore/XCoreAsmPrinter.cpp | 3 +-
 lib/Target/XCore/XCoreFrameLowering.cpp | 37 +-
 lib/Target/XCore/XCoreFrameLowering.h | 4 +-
 lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp | 5 +-
 lib/Target/XCore/XCoreISelDAGToDAG.cpp | 77 +-
 lib/Target/XCore/XCoreISelLowering.cpp | 242 +-
 lib/Target/XCore/XCoreISelLowering.h | 39 +-
 lib/Target/XCore/XCoreInstrInfo.cpp | 68 +-
 lib/Target/XCore/XCoreInstrInfo.h | 13 +-
 lib/Target/XCore/XCoreLowerThreadLocal.cpp | 12 +-
 lib/Target/XCore/XCoreSelectionDAGInfo.cpp | 13 +-
 lib/Target/XCore/XCoreSelectionDAGInfo.h | 21 +-
 lib/Target/XCore/XCoreTargetMachine.cpp | 14 +-
 lib/Target/XCore/XCoreTargetMachine.h | 2 +-
 lib/Target/XCore/XCoreTargetObjectFile.cpp | 8 +-
 lib/Target/XCore/XCoreTargetObjectFile.h | 3 +-
 1042 files changed, 106628 insertions(+), 53181 deletions(-)
 create mode 100644 lib/Target/AArch64/AArch64CallLowering.cpp
 create mode 100644 lib/Target/AArch64/AArch64CallLowering.h
 create mode 100644 lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
 create mode 100644 lib/Target/AArch64/AArch64RegisterBankInfo.cpp
 create mode 100644 lib/Target/AArch64/AArch64RegisterBankInfo.h
 create mode 100644 lib/Target/AArch64/AArch64SchedKryo.td
 create mode 100644 lib/Target/AArch64/AArch64SchedKryoDetails.td
 create mode 100644 lib/Target/AArch64/AArch64SchedVulcan.td
 create mode 100644 lib/Target/AArch64/AArch64SystemOperands.td
 delete mode 100644 lib/Target/AArch64/AsmParser/Makefile
 delete mode 100644 lib/Target/AArch64/Disassembler/Makefile
 delete mode 100644 lib/Target/AArch64/InstPrinter/Makefile
 delete mode 100644 lib/Target/AArch64/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/AArch64/Makefile
 delete mode 100644 lib/Target/AArch64/TargetInfo/Makefile
 delete mode 100644 lib/Target/AArch64/Utils/Makefile
 create mode 100644 lib/Target/AMDGPU/AMDGPUCallLowering.cpp
 create mode 100644 lib/Target/AMDGPU/AMDGPUCallLowering.h
 create mode 100644 lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
 delete mode 100644 lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
 delete mode 100644 lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
 create mode 100644 lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
 delete mode 100644 lib/Target/AMDGPU/AsmParser/Makefile
 create mode 100644 lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
 create mode 100644 lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
 create mode 100644 lib/Target/AMDGPU/Disassembler/CMakeLists.txt
 create mode 100644 lib/Target/AMDGPU/Disassembler/LLVMBuild.txt
 create mode 100644 lib/Target/AMDGPU/GCNHazardRecognizer.cpp
 create mode 100644 lib/Target/AMDGPU/GCNHazardRecognizer.h
 delete mode 100644 lib/Target/AMDGPU/InstPrinter/Makefile
 delete mode 100644 lib/Target/AMDGPU/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/AMDGPU/Makefile
 create mode 100644 lib/Target/AMDGPU/R600FrameLowering.cpp
 create mode 100644 lib/Target/AMDGPU/R600FrameLowering.h
 delete mode 100644 lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
 create mode 100644 lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
 delete mode 100644 lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
 create mode 100644 lib/Target/AMDGPU/SIWholeQuadMode.cpp
 delete mode 100644 lib/Target/AMDGPU/TargetInfo/Makefile
 create mode 100644 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
 create mode 100644 lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
 create mode 100644 lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
 create mode 100644 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
 create mode 100644 lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
 delete mode 100644 lib/Target/AMDGPU/Utils/Makefile
 delete mode 100644 lib/Target/ARM/AsmParser/Makefile
 delete mode 100644 lib/Target/ARM/Disassembler/Makefile
 delete mode 100644 lib/Target/ARM/InstPrinter/Makefile
 delete mode 100644 lib/Target/ARM/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/ARM/Makefile
 delete mode 100644 lib/Target/ARM/TargetInfo/Makefile
 delete mode 100644 lib/Target/AVR/AVRConfig.h
 create mode 100644 lib/Target/AVR/AVRFrameLowering.h
 create mode 100644 lib/Target/AVR/AVRISelLowering.h
 create mode 100644 lib/Target/AVR/AVRInstrFormats.td
 create mode 100644 lib/Target/AVR/AVRInstrInfo.cpp
 create mode 100644 lib/Target/AVR/AVRInstrInfo.h
 create mode 100644 lib/Target/AVR/AVRInstrInfo.td
 create mode 100644 lib/Target/AVR/AVRRegisterInfo.cpp
 create mode 100644 lib/Target/AVR/AVRRegisterInfo.h
 create mode 100644 lib/Target/AVR/AVRSubtarget.cpp
 create mode 100644 lib/Target/AVR/AVRSubtarget.h
 create mode 100644 lib/Target/AVR/AVRTargetMachine.h
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
 create mode 100644 lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
 create mode 100644 lib/Target/AVR/MCTargetDesc/CMakeLists.txt
 create mode 100644 lib/Target/AVR/MCTargetDesc/LLVMBuild.txt
 delete mode 100644 lib/Target/AVR/Makefile
 create mode 100644 lib/Target/AVR/TODO.md
 delete mode 100644 lib/Target/AVR/TargetInfo/Makefile
 delete mode 100644 lib/Target/BPF/InstPrinter/Makefile
 delete mode 100644 lib/Target/BPF/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/BPF/Makefile
 delete mode 100644 lib/Target/BPF/TargetInfo/Makefile
 delete mode 100644 lib/Target/CppBackend/CMakeLists.txt
 delete mode 100644 lib/Target/CppBackend/CPPBackend.cpp
 delete mode 100644 lib/Target/CppBackend/CPPTargetMachine.h
 delete mode 100644 lib/Target/CppBackend/LLVMBuild.txt
 delete mode 100644 lib/Target/CppBackend/Makefile
 delete mode 100644 lib/Target/CppBackend/TargetInfo/CMakeLists.txt
 delete mode 100644 lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
 delete mode 100644 lib/Target/CppBackend/TargetInfo/LLVMBuild.txt
 delete mode 100644 lib/Target/CppBackend/TargetInfo/Makefile
 delete mode 100644 lib/Target/Hexagon/AsmParser/Makefile
 delete mode 100644 lib/Target/Hexagon/Disassembler/Makefile
 create mode 100644 lib/Target/Hexagon/HexagonBlockRanges.cpp
 create mode 100644 lib/Target/Hexagon/HexagonBlockRanges.h
 create mode 100644 lib/Target/Hexagon/HexagonBranchRelaxation.cpp
 delete mode 100644 lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
 create mode 100644 lib/Target/Hexagon/HexagonOptAddrMode.cpp
 delete mode 100644 lib/Target/Hexagon/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/Hexagon/Makefile
 delete mode 100644 lib/Target/Hexagon/TargetInfo/Makefile
 create mode 100644 lib/Target/Lanai/AsmParser/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/AsmParser/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
 create mode 100644 lib/Target/Lanai/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/Disassembler/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/Disassembler/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
 create mode 100644 lib/Target/Lanai/Disassembler/LanaiDisassembler.h
 create mode 100644 lib/Target/Lanai/InstPrinter/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/InstPrinter/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
 create mode 100644 lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
 create mode 100644 lib/Target/Lanai/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/Lanai.h
 create mode 100644 lib/Target/Lanai/Lanai.td
 create mode 100644 lib/Target/Lanai/LanaiAluCode.h
 create mode 100644 lib/Target/Lanai/LanaiAsmPrinter.cpp
 create mode 100644 lib/Target/Lanai/LanaiCallingConv.td
 create mode 100644 lib/Target/Lanai/LanaiCondCode.h
 create mode 100644 lib/Target/Lanai/LanaiDelaySlotFiller.cpp
 create mode 100644 lib/Target/Lanai/LanaiFrameLowering.cpp
 create mode 100644 lib/Target/Lanai/LanaiFrameLowering.h
 create mode 100644 lib/Target/Lanai/LanaiISelDAGToDAG.cpp
 create mode 100644 lib/Target/Lanai/LanaiISelLowering.cpp
 create mode 100644 lib/Target/Lanai/LanaiISelLowering.h
 create mode 100644 lib/Target/Lanai/LanaiInstrFormats.td
 create mode 100644 lib/Target/Lanai/LanaiInstrInfo.cpp
 create mode 100644 lib/Target/Lanai/LanaiInstrInfo.h
 create mode 100644 lib/Target/Lanai/LanaiInstrInfo.td
 create mode 100644 lib/Target/Lanai/LanaiMCInstLower.cpp
 create mode 100644 lib/Target/Lanai/LanaiMCInstLower.h
 create mode 100644 lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
 create mode 100644 lib/Target/Lanai/LanaiMachineFunctionInfo.h
 create mode 100644 lib/Target/Lanai/LanaiMemAluCombiner.cpp
 create mode 100644 lib/Target/Lanai/LanaiRegisterInfo.cpp
 create mode 100644 lib/Target/Lanai/LanaiRegisterInfo.h
 create mode 100644 lib/Target/Lanai/LanaiRegisterInfo.td
 create mode 100644 lib/Target/Lanai/LanaiSchedule.td
 create mode 100644 lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
 create mode 100644 lib/Target/Lanai/LanaiSelectionDAGInfo.h
 create mode 100644 lib/Target/Lanai/LanaiSubtarget.cpp
 create mode 100644 lib/Target/Lanai/LanaiSubtarget.h
 create mode 100644 lib/Target/Lanai/LanaiTargetMachine.cpp
 create mode 100644 lib/Target/Lanai/LanaiTargetMachine.h
 create mode 100644 lib/Target/Lanai/LanaiTargetObjectFile.cpp
 create mode 100644 lib/Target/Lanai/LanaiTargetObjectFile.h
 create mode 100644 lib/Target/Lanai/LanaiTargetTransformInfo.h
 create mode 100644 lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
 create mode 100644 lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
 create mode 100644 lib/Target/Lanai/TargetInfo/CMakeLists.txt
 create mode 100644 lib/Target/Lanai/TargetInfo/LLVMBuild.txt
 create mode 100644 lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
 delete mode 100644 lib/Target/MSP430/InstPrinter/Makefile
 delete mode 100644 lib/Target/MSP430/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/MSP430/Makefile
 delete mode 100644 lib/Target/MSP430/TargetInfo/Makefile
 delete mode 100644 lib/Target/Makefile
 delete mode 100644 lib/Target/Mips/AsmParser/Makefile
 delete mode 100644 lib/Target/Mips/Disassembler/Makefile
 delete mode 100644 lib/Target/Mips/InstPrinter/Makefile
 delete mode 100644 lib/Target/Mips/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/Mips/Makefile
 create mode 100644 lib/Target/Mips/MipsHazardSchedule.cpp
 delete mode 100644 lib/Target/Mips/TargetInfo/Makefile
 delete mode 100644 lib/Target/NVPTX/InstPrinter/Makefile
 delete mode 100644 lib/Target/NVPTX/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/NVPTX/Makefile
 create mode 100644 lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
 create mode 100644 lib/Target/NVPTX/NVVMIntrRange.cpp
 delete mode 100644 lib/Target/NVPTX/TargetInfo/Makefile
 delete mode 100644 lib/Target/PowerPC/AsmParser/Makefile
 delete mode 100644 lib/Target/PowerPC/Disassembler/Makefile
 delete mode 100644 lib/Target/PowerPC/InstPrinter/Makefile
 delete mode 100644 lib/Target/PowerPC/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/PowerPC/Makefile
 create mode 100644 lib/Target/PowerPC/PPCCCState.cpp
 create mode 100644 lib/Target/PowerPC/PPCCCState.h
 delete mode 100644 lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
 create mode 100644 lib/Target/PowerPC/PPCQPXLoadSplat.cpp
 create mode 100644 lib/Target/PowerPC/README_P9.txt
 delete mode 100644 lib/Target/PowerPC/TargetInfo/Makefile
 create mode 100644 lib/Target/PowerPC/p9-instrs.txt
 delete mode 100644 lib/Target/Sparc/AsmParser/Makefile
 delete mode 100644 lib/Target/Sparc/Disassembler/Makefile
 delete mode 100644 lib/Target/Sparc/InstPrinter/Makefile
 create mode 100755 lib/Target/Sparc/LeonFeatures.td
 create mode 100755 lib/Target/Sparc/LeonPasses.cpp
 create mode 100755 lib/Target/Sparc/LeonPasses.h
 delete mode 100644 lib/Target/Sparc/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/Sparc/Makefile
 create mode 100755 lib/Target/Sparc/SparcSchedule.td
 delete mode 100644 lib/Target/Sparc/TargetInfo/Makefile
 delete mode 100644 lib/Target/SystemZ/AsmParser/Makefile
 delete mode 100644 lib/Target/SystemZ/Disassembler/Makefile
 delete mode 100644 lib/Target/SystemZ/InstPrinter/Makefile
 delete mode 100644 lib/Target/SystemZ/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/SystemZ/Makefile
 create mode 100644 lib/Target/SystemZ/SystemZTDC.cpp
 delete mode 100644 lib/Target/SystemZ/TargetInfo/Makefile
 delete mode 100644 lib/Target/WebAssembly/Disassembler/Makefile
 delete mode 100644 lib/Target/WebAssembly/InstPrinter/Makefile
 delete mode 100644 lib/Target/WebAssembly/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/WebAssembly/Makefile
 delete mode 100644 lib/Target/WebAssembly/Relooper.cpp
 delete mode 100644 lib/Target/WebAssembly/Relooper.h
 delete mode 100644 lib/Target/WebAssembly/TargetInfo/Makefile
 create mode 100644 lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
 create mode 100644 lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
 delete mode 100644 lib/Target/WebAssembly/WebAssemblyPEI.cpp
 create mode 100644 lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
 create mode 100644 lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
 create mode 100644 lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
 delete mode 100644 lib/Target/X86/AsmParser/Makefile
 delete mode 100644 lib/Target/X86/Disassembler/Makefile
 delete mode 100644 lib/Target/X86/Disassembler/X86Disassembler.h
 delete mode 100644 lib/Target/X86/InstPrinter/Makefile
 delete mode 100644 lib/Target/X86/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
 delete mode 100644 lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
 delete mode 100644 lib/Target/X86/Makefile
 delete mode 100644 lib/Target/X86/TargetInfo/Makefile
 delete mode 100644 lib/Target/X86/Utils/Makefile
 create mode 100644 lib/Target/X86/X86FixupBWInsts.cpp
 create mode 100644 lib/Target/X86/X86FixupSetCC.cpp
 create mode 100644 lib/Target/X86/X86WinAllocaExpander.cpp
 delete mode 100644 lib/Target/XCore/Disassembler/Makefile
 delete mode 100644 lib/Target/XCore/Disassembler/Makefile
 delete mode 100644 lib/Target/XCore/InstPrinter/Makefile
 delete mode 100644 lib/Target/XCore/MCTargetDesc/Makefile
 delete mode 100644 lib/Target/XCore/Makefile
 delete mode 100644 lib/Target/XCore/TargetInfo/Makefile
(limited to 'lib/Target')

diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 21106c9ad29a..c767c75fce57 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -27,6 +27,7 @@ class FunctionPass;
 class MachineFunctionPass;
 
 FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64RedundantCopyEliminationPass();
 FunctionPass *createAArch64ConditionalCompares();
 FunctionPass *createAArch64AdvSIMDScalar();
 FunctionPass *createAArch64BranchRelaxation();
@@ -44,6 +45,8 @@ FunctionPass *createAArch64A53Fix835769();
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
 FunctionPass *createAArch64CollectLOHPass();
+
+void initializeAArch64ExpandPseudoPass(PassRegistry&);
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index cd3e84d38fe2..b1e881685b0c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
+// Target-independent interfaces which we are implementing.
 //===----------------------------------------------------------------------===//
 
 include "llvm/Target/Target.td"
@@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
 def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
   "Enable ARMv8 CRC-32 checksum instructions">;
 
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+  "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
+
 def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
   "Enable ARMv8 PMUv3 Performance Monitors extension">;
 
@@ -58,6 +61,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
"alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", + "true", "Use alternative pattern for sextload convert to f32">; + +def FeatureMacroOpFusion : SubtargetFeature< + "macroop-fusion", "HasMacroOpFusion", "true", + "CPU supports macro op fusion">; + +def FeatureDisableLatencySchedHeuristic : SubtargetFeature< + "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", + "Disable latency scheduling heuristic">; + +def FeatureUseRSqrt : SubtargetFeature< + "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -66,7 +113,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps]>; + "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; //===----------------------------------------------------------------------===// // Register File Description @@ -84,6 +131,12 @@ include "AArch64InstrInfo.td" def AArch64InstrInfo : InstrInfo; +//===----------------------------------------------------------------------===// +// Named operands for MRS/MSR/TLBI/... +//===----------------------------------------------------------------------===// + +include "AArch64SystemOperands.td" + //===----------------------------------------------------------------------===// // AArch64 Processors supported. // @@ -91,61 +144,133 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedM1.td" +include "AArch64SchedKryo.td" +include "AArch64SchedVulcan.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", - "Cortex-A35 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A35 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A53 ARM processors", [ + FeatureBalanceFPOps, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseAA + ]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, + "Cortex-A57 ARM processors", [ + FeatureBalanceFPOps, + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive + ]>; + +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", [ + FeatureCRC, FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", + "Cortex-A73 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, + "Cyclone", [ + FeatureAlternateSExtLoadCVTF32Pattern, FeatureCrypto, - FeatureCRC, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, FeaturePerfMon, - FeatureZCRegMove, 
FeatureZCZeroing]>; + FeatureSlowMisaligned128Store, + FeatureZCRegMove, + FeatureZCZeroing + ]>; def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M1 processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Samsung Exynos-M1 processors", [ + FeatureAvoidQuadLdStPairs, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseRSqrt + ]>; + +def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", + "Qualcomm Kryo processors", [ + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing + ]>; + +def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", + "Broadcom Vulcan processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, + FeaturePostRAScheduler, + HasV8_1aOps]>; -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC, - FeaturePerfMon]>; +def : ProcessorModel<"generic", NoSchedModel, [ + FeatureCRC, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler + ]>; // FIXME: Cortex-A35 is currently modelled as a Cortex-A53 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; +// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57. 
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; +def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; +def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; +def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp index d215d9e831c0..c2cca63f4977 100644 --- a/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -87,6 +86,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "Workaround A53 erratum 835769 pass"; } @@ -133,8 +137,8 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) - if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && - !TBB && !FBB) + if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && + !FBB) return S; return nullptr; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 3d1ab4e3fc2b..0465e59dc54a 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -43,7 +43,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" @@ -125,6 +124,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "A57 FP Anti-dependency breaker"; } @@ -222,7 +226,7 @@ public: } /// Return true if MI is a member of the chain. - bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } + bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; } /// Return the number of instructions in the chain. unsigned size() const { @@ -248,9 +252,10 @@ public: MachineInstr *getKill() const { return KillInst; } /// Return an instruction that can be used as an iterator for the end /// of the chain. This is the maximum of KillInst (if set) and LastInst. - MachineBasicBlock::iterator getEnd() const { + MachineBasicBlock::iterator end() const { return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); } + MachineBasicBlock::iterator begin() const { return getStart(); } /// Can the Kill instruction (assuming one exists) be modified? 
bool isKillImmutable() const { return KillIsImmutable; } @@ -307,9 +312,10 @@ public: //===----------------------------------------------------------------------===// bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { - // Don't do anything if this isn't an A53 or A57. - if (!(F.getSubtarget().isCortexA53() || - F.getSubtarget().isCortexA57())) + if (skipFunction(*F.getFunction())) + return false; + + if (!F.getSubtarget().balanceFPOps()) return false; bool Changed = false; @@ -492,15 +498,14 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector GV, int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB) { RegScavenger RS; - RS.enterBasicBlock(&MBB); + RS.enterBasicBlock(MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { + for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) { RS.forward(I); AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); @@ -530,8 +535,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, for (auto Reg : Ord) { if (!AvailableRegs[Reg]) continue; - if ((C == Color::Even && (Reg % 2) == 0) || - (C == Color::Odd && (Reg % 2) == 1)) + if (C == getColor(Reg)) return Reg; } @@ -554,16 +558,14 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); std::map Substs; - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { - if (!G->contains(I) && - (&*I != G->getKill() || G->isKillImmutable())) + for (MachineInstr &I : *G) { + if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable())) continue; // I is a member of G, or I is a mutable instruction that kills G. std::vector ToErase; - for (auto &U : I->operands()) { + for (auto &U : I.operands()) { if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { unsigned OrigReg = U.getReg(); U.setReg(Substs[OrigReg]); @@ -583,11 +585,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, Substs.erase(J); // Only change the def if this isn't the last instruction. - if (&*I != G->getKill()) { - MachineOperand &MO = I->getOperand(0); + if (&I != G->getKill()) { + MachineOperand &MO = I.getOperand(0); bool Change = TransformAll || getColor(MO.getReg()) != C; - if (G->requiresFixup() && &*I == G->getLast()) + if (G->requiresFixup() && &I == G->getLast()) Change = false; if (Change) { diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 3afcdfb8b930..4846ef08c983 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -20,10 +20,9 @@ // e = getelementptr ..., i64 a // // This is legal to do if the computations are marked with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). +// markers. 
Moreover, the current heuristic is simple: it does not create new +// sext operations, i.e., it gives up when a sext would have forked (e.g., if a +// = add i32 b, c, two sexts are required to promote the computation). // // FIXME: This pass may be useful for other targets too. // ===---------------------------------------------------------------------===// @@ -207,9 +206,7 @@ bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { } static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa(Inst) && OpIdx == 0) - return false; - return true; + return !(isa(Inst) && OpIdx == 0); } bool @@ -481,6 +478,9 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { } bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + if (!EnableAddressTypePromotion || F.isDeclaration()) return false; Func = &F; diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 1644d71d2821..d0a2dd3fa1fc 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -76,12 +76,12 @@ private: // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. - bool isProfitableToTransform(const MachineInstr *MI) const; + bool isProfitableToTransform(const MachineInstr &MI) const; // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. - void transformInstruction(MachineInstr *MI); + void transformInstruction(MachineInstr &MI); // processMachineBasicBlock - Main optimzation loop. bool processMachineBasicBlock(MachineBasicBlock *MBB); @@ -132,19 +132,19 @@ static bool isFPR64(unsigned Reg, unsigned SubReg, // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 // copy instruction. Return zero_reg if the instruction is not a copy. -static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { +static MachineOperand *getSrcFromCopy(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { SubReg = 0; // The "FMOV Xd, Dn" instruction is the typical form. if (MI->getOpcode() == AArch64::FMOVDXr || MI->getOpcode() == AArch64::FMOVXDr) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see // these at this stage, but it's easy to check for. if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { SubReg = AArch64::dsub; - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } // Or just a plain COPY instruction. This can be directly to/from FPR64, // or it can be a dsub subreg reference to an FPR128. 
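The hunks above change getSrcFromCopy to return a MachineOperand* rather than a bare register number, so callers can read and clear the kill flag that lives on the operand, not on the register. A minimal self-contained C++ sketch of that idea (toy names and types, not LLVM's actual classes):

#include <iostream>

struct Operand {
  unsigned Reg; // register number
  bool IsKill;  // is this the last use of Reg in the block?
};

// Old shape: returning only the register number loses the kill flag.
unsigned srcRegOnly(Operand &Op) { return Op.Reg; }

// New shape: returning the operand keeps the flag reachable and mutable.
Operand *srcOperand(Operand &Op) { return &Op; }

int main() {
  Operand CopySrc{7, /*IsKill=*/true};
  Operand *Src = srcOperand(CopySrc);
  // The register gains another use after the transform, so it can no longer
  // be killed at the copy; this mirrors the MOSrc0->setIsKill(false) call in
  // transformInstruction later in this patch.
  Src->IsKill = false;
  std::cout << "reg " << Src->Reg << " kill=" << Src->IsKill << "\n";
}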
@@ -152,18 +152,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI, if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) { SubReg = MI->getOperand(1).getSubReg(); - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } } // Otherwise, this is some other kind of instruction. - return 0; + return nullptr; } // getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent @@ -189,16 +189,16 @@ static unsigned getTransformOpcode(unsigned Opc) { return Opc; } -static bool isTransformable(const MachineInstr *MI) { - unsigned Opc = MI->getOpcode(); +static bool isTransformable(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); return Opc != getTransformOpcode(Opc); } // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool -AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { +bool AArch64AdvSIMDScalar::isProfitableToTransform( + const MachineInstr &MI) const { // If this instruction isn't eligible to be transformed (no SIMD equivalent), // early exit since that's the common case. if (!isTransformable(MI)) @@ -209,33 +209,33 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { unsigned NumNewCopies = 3; unsigned NumRemovableCopies = 0; - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); + unsigned SubReg0; + unsigned SubReg1; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If the source was from a copy, we don't need to insert a new copy. - if (Src0) + if (MOSrc0) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0)) ++NumRemovableCopies; } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - if (Src1) + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (MOSrc1) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1)) ++NumRemovableCopies; } @@ -244,14 +244,14 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { // any of the uses is a transformable instruction, it's likely the tranforms // will chain, enabling us to save a copy there, too. This is an aggressive // heuristic that approximates the graph based cost analysis described above. 
- unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); bool AllUsesAreCopies = true; for (MachineRegisterInfo::use_instr_nodbg_iterator Use = MRI->use_instr_nodbg_begin(Dst), E = MRI->use_instr_nodbg_end(); Use != E; ++Use) { unsigned SubReg; - if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use)) ++NumRemovableCopies; // If the use is an INSERT_SUBREG, that's still something that can // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's @@ -279,12 +279,11 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { return TransformAll; } -static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, +static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI, unsigned Dst, unsigned Src, bool IsKill) { - MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), - Dst) - .addReg(Src, getKillRegState(IsKill)); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AArch64::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); DEBUG(dbgs() << " adding copy: " << *MIB); ++NumCopiesInserted; return MIB; @@ -293,43 +292,56 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. -void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { - DEBUG(dbgs() << "Scalar transform: " << *MI); +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { + DEBUG(dbgs() << "Scalar transform: " << MI); - MachineBasicBlock *MBB = MI->getParent(); - unsigned OldOpc = MI->getOpcode(); + MachineBasicBlock *MBB = MI.getParent(); + unsigned OldOpc = MI.getOpcode(); unsigned NewOpc = getTransformOpcode(OldOpc); assert(OldOpc != NewOpc && "transform an instruction to itself?!"); // Check if we need a copy for the source registers. - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); unsigned Src0 = 0, SubReg0; unsigned Src1 = 0, SubReg1; + bool KillSrc0 = false, KillSrc1 = false; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If there are no other users of the original source, we can delete // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { - assert(Src0 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc0) { + Src0 = MOSrc0->getReg(); + KillSrc0 = MOSrc0->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. 
+ MOSrc0->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(MOSrc0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { - assert(Src1 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc1) { + Src1 = MOSrc1->getReg(); + KillSrc1 = MOSrc1->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. + MOSrc1->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(MOSrc1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } // If we weren't able to reference the original source directly, create a @@ -337,12 +349,14 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { if (!Src0) { SubReg0 = 0; Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src0, OrigSrc0, true); + insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0); + KillSrc0 = true; } if (!Src1) { SubReg1 = 0; Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src1, OrigSrc1, true); + insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1); + KillSrc1 = true; } // Create a vreg for the destination. @@ -353,17 +367,17 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { // For now, all of the new instructions have the same simple three-register // form, so no need to special case based on what instruction we're // building. - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) - .addReg(Src0, getKillRegState(true), SubReg0) - .addReg(Src1, getKillRegState(true), SubReg1); + BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(KillSrc0), SubReg0) + .addReg(Src1, getKillRegState(KillSrc1), SubReg1); // Now copy the result back out to a GPR. // FIXME: Try to avoid this if all uses could actually just use the FPR64 // directly. - insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true); // Erase the old instruction. 
-  MI->eraseFromParent();
+  MI.eraseFromParent();
   ++NumScalarInsnsUsed;
 }
@@ -372,8 +386,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
 bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
   bool Changed = false;
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
-    MachineInstr *MI = I;
-    ++I;
+    MachineInstr &MI = *I++;
     if (isProfitableToTransform(MI)) {
       transformInstruction(MI);
       Changed = true;
@@ -387,6 +400,9 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
   bool Changed = false;
   DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
 
+  if (skipFunction(*mf.getFunction()))
+    return false;
+
   MRI = &mf.getRegInfo();
   TII = mf.getSubtarget().getInstrInfo();
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index ada995bad37e..22374f754603 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -49,6 +49,7 @@ namespace {
 class AArch64AsmPrinter : public AsmPrinter {
   AArch64MCInstLower MCInstLowering;
   StackMaps SM;
+  const AArch64Subtarget *STI;
 
 public:
   AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -83,11 +84,11 @@ public:
   bool runOnMachineFunction(MachineFunction &F) override {
     AArch64FI = F.getInfo<AArch64FunctionInfo>();
+    STI = static_cast<const AArch64Subtarget *>(&F.getSubtarget());
     return AsmPrinter::runOnMachineFunction(F);
   }
 
 private:
-  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
   void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
   bool printAsmRegInClass(const MachineOperand &MO,
@@ -112,6 +113,9 @@ private:
   /// \brief Emit the LOHs contained in AArch64FI.
   void EmitLOHs();
 
+  /// Emit instruction to set float register to zero.
+  void EmitFMov0(const MachineInstr &MI);
+
   typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
   MInstToMCSymbol LOHInstToLabel;
 };
@@ -133,19 +137,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 }
 
-MachineLocation
-AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
-  MachineLocation Location;
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  // Frame address. Currently handles register +- offset only.
-  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
-    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-  else {
-    DEBUG(dbgs() << "DBG_VALUE instruction ignored!
" << *MI << "\n"); - } - return Location; -} - void AArch64AsmPrinter::EmitLOHs() { SmallVector MCArgs; @@ -238,8 +229,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - MF->getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -404,16 +394,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF) .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF) .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF) @@ -430,6 +420,40 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { + unsigned DestReg = MI.getOperand(0).getReg(); + if (STI->hasZeroCycleZeroing()) { + // Convert S/D register to corresponding Q register + if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) { + DestReg = AArch64::Q0 + (DestReg - AArch64::S0); + } else { + assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + DestReg = AArch64::Q0 + (DestReg - AArch64::D0); + } + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + MCInst FMov; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; + } + EmitToStreamer(*OutStreamer, FMov); + } +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. 
#include "AArch64GenMCPseudoLowering.inc" @@ -535,6 +559,11 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + case AArch64::FMOVS0: + case AArch64::FMOVD0: + EmitFMov0(*MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index a614f555a4e9..9ec6ae4118a4 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -177,7 +177,7 @@ void AArch64BranchRelaxation::scanFunction() { void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { unsigned Size = 0; for (const MachineInstr &MI : MBB) - Size += TII->GetInstSizeInBytes(&MI); + Size += TII->GetInstSizeInBytes(MI); BlockInfo[MBB.getNumber()].Size = Size; } @@ -195,7 +195,7 @@ unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { // Sum instructions before MI in MBB. for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); + Offset += TII->GetInstSizeInBytes(*I); } return Offset; } @@ -415,12 +415,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { // Analyze the branch so we know how to update the successor lists. MachineBasicBlock *TBB, *FBB; SmallVector Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + TII->analyzeBranch(*MBB, TBB, FBB, Cond, false); MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); // No need for the branch to the next block. We're adding an unconditional // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); + int delta = TII->GetInstSizeInBytes(MBB->back()); BlockInfo[MBB->getNumber()].Size -= delta; MBB->back().eraseFromParent(); // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below @@ -446,12 +446,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { if (MI->getOpcode() == AArch64::Bcc) invertBccCondition(MIB); MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI); MI->eraseFromParent(); // Finally, keep the block offsets up to date. @@ -463,12 +463,13 @@ bool AArch64BranchRelaxation::relaxBranchInstructions() { bool Changed = false; // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. 
- for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); + for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { + MachineBasicBlock &MBB = *I; + MachineInstr &MI = *MBB.getFirstTerminator(); + if (isConditionalBranch(MI.getOpcode()) && + !isBlockInRange(&MI, getDestBlock(&MI), + getBranchDisplacementBits(MI.getOpcode()))) { + fixupConditionalBranch(&MI); ++NumRelaxed; Changed = true; } @@ -513,8 +514,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { return MadeChange; } -/// createAArch64BranchRelaxation - returns an instance of the constpool -/// island pass. +/// Returns an instance of the AArch64 Branch Relaxation pass. FunctionPass *llvm::createAArch64BranchRelaxation() { return new AArch64BranchRelaxation(); } diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp new file mode 100644 index 000000000000..e3522e63c21c --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -0,0 +1,104 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64CallLowering.h" +#include "AArch64ISelLowering.h" + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) + : CallLowering(&TLI) { +} + +bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + MachineInstr *Return = MIRBuilder.buildInstr(AArch64::RET_ReallyLR); + assert(Return && "Unable to build a return instruction?!"); + + assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); + if (VReg) { + assert(Val->getType()->isIntegerTy() && "Type not supported yet"); + unsigned Size = Val->getType()->getPrimitiveSizeInBits(); + assert((Size == 64 || Size == 32) && "Size not supported yet"); + unsigned ResReg = (Size == 32) ? AArch64::W0 : AArch64::X0; + // Set the insertion point to be right before Return. 
+ MIRBuilder.setInstr(*Return, /* Before */ true); + MachineInstr *Copy = + MIRBuilder.buildInstr(TargetOpcode::COPY, ResReg, VReg); + (void)Copy; + assert(Copy->getNextNode() == Return && + "The insertion did not happen where we expected"); + MachineInstrBuilder(MIRBuilder.getMF(), Return) + .addReg(ResReg, RegState::Implicit); + } + return true; +} + +bool AArch64CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, + const SmallVectorImpl &VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + unsigned NumArgs = Args.size(); + Function::const_arg_iterator CurOrigArg = Args.begin(); + const AArch64TargetLowering &TLI = *getTLI(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + MVT ValVT = MVT::getVT(CurOrigArg->getType()); + CCAssignFn *AssignFn = + TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, ISD::ArgFlagsTy(), CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Args.size() && + "We have a different number of location and args?!"); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + assert(VA.isRegLoc() && "Not yet implemented"); + // Transform the arguments in physical registers into virtual ones. + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildInstr(TargetOpcode::COPY, VRegs[i], VA.getLocReg()); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + // We don't care about bitcast. + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // Zero/Sign extend the register. + assert(0 && "Not yet implemented"); + break; + } + } + return true; +} diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h new file mode 100644 index 000000000000..411622803461 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.h @@ -0,0 +1,36 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING + +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class AArch64TargetLowering; + +class AArch64CallLowering: public CallLowering { + public: + AArch64CallLowering(const AArch64TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + bool + lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function::ArgumentListType &Args, + const SmallVectorImpl &VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 388d64ec4e99..178e3971640e 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -45,6 +45,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // supported there. CCIfNest>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf>>, + CCIfConsecutiveRegs>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -86,6 +89,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, + CCIfSwiftError>>, + // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. CCIfBigEndian>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf>>, + + // A SwiftError is passed in X19. + CCIfSwiftError>>, + CCIfConsecutiveRegs>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -270,6 +281,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // case) def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; +def CSR_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>; + // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the // fast path for calculation, but other registers except X0 (argument/return) @@ -310,3 +324,7 @@ def CSR_AArch64_AllRegs (sequence "Q%u", 0, 31))>; def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sequence "X%u", 9, 15))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 9310ac4a44a2..011a03622ba5 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -39,6 +39,9 @@ struct LDTLSCleanup : public MachineFunctionPass { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(*MF.getFunction())) + return false; + AArch64FunctionInfo *AFI = MF.getInfo(); if (AFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. 
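For context on the hunk above: the pass caches the result of the first TLS base address call in a virtual register and turns every later call into a plain copy, which is why it bails out below two accesses. A self-contained C++ sketch of that caching shape (toy code standing in for the pass, not the pass itself):

#include <iostream>
#include <optional>

static int expensiveTlsBaseCall() {
  std::cout << "  (expensive TLS base address call)\n";
  return 0x1000; // pretend this is the TLS base address
}

struct Cleanup {
  std::optional<int> CachedBase; // plays the role of TLSBaseAddrReg

  int tlsBaseAddr() {
    if (CachedBase)
      return *CachedBase;                // like replaceTLSBaseAddrCall
    CachedBase = expensiveTlsBaseCall(); // like setRegister, done once
    return *CachedBase;
  }
};

int main() {
  Cleanup C;
  std::cout << C.tlsBaseAddr() << "\n"; // first access pays for the call
  std::cout << C.tlsBaseAddr() << "\n"; // later accesses are just copies
}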
@@ -69,9 +72,9 @@ struct LDTLSCleanup : public MachineFunctionPass { break; if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg); else - I = setRegister(I, &TLSBaseAddrReg); + I = setRegister(*I, &TLSBaseAddrReg); Changed = true; break; default: @@ -89,27 +92,27 @@ struct LDTLSCleanup : public MachineFunctionPass { // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I, unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - AArch64::X0).addReg(TLSBaseAddrReg); + MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII->get(TargetOpcode::COPY), AArch64::X0) + .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + I.eraseFromParent(); return Copy; } // Create a virtal register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Create a virtual register for the TLS base address. @@ -118,7 +121,7 @@ struct LDTLSCleanup : public MachineFunctionPass { // Insert a copy from X0 to TLSBaseAddrReg for later. MachineInstr *Copy = - BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(), TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) .addReg(AArch64::X0); diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 78c239b11ef3..5eecb3a86856 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -179,6 +179,11 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } @@ -623,10 +628,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs, continue; } DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - SmallVector Args; - Args.push_back(L2); - Args.push_back(L1); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); ++NumADRPSimpleCandidate; } #ifdef DEBUG @@ -760,13 +762,9 @@ static bool registerADRCandidate(const MachineInstr &Use, "ADD already involved in LOH."); DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - SmallVector Args; - Args.push_back(&Def); - Args.push_back(&Use); - - AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? 
MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); + AArch64FI.addLOHDirective( + Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, + {&Def, &Use}); return true; } @@ -1036,6 +1034,9 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, } bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis(); diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index fc27bfee73d1..8fff381d391e 100644 --- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -70,7 +70,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -144,10 +143,18 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( if (I->getOpcode() != AArch64::Bcc) return nullptr; + // Since we may modify cmp of this MBB, make sure NZCV does not live out. + for (auto SuccBB : MBB->successors()) + if (SuccBB->isLiveIn(AArch64::NZCV)) + return nullptr; + // Now find the instruction controlling the terminator. for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { --I; assert(!I->isTerminator() && "Spurious terminator"); + // Check if there is any use of NZCV between CMP and Bcc. + if (I->readsRegister(AArch64::NZCV)) + return nullptr; switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. case AArch64::SUBSWri: @@ -166,7 +173,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); return nullptr; } - return I; + return &*I; } // Prevent false positive case like: // cmp w19, #0 @@ -268,13 +275,13 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, // The fact that this comparison was picked ensures that it's related to the // first terminator instruction. - MachineInstr *BrMI = MBB->getFirstTerminator(); + MachineInstr &BrMI = *MBB->getFirstTerminator(); // Change condition in branch instruction. 
- BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc)) + BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc)) .addImm(Cmp) - .addOperand(BrMI->getOperand(1)); - BrMI->eraseFromParent(); + .addOperand(BrMI.getOperand(1)); + BrMI.eraseFromParent(); MBB->updateTerminator(); @@ -311,6 +318,9 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); MRI = &MF.getRegInfo(); @@ -327,7 +337,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector HeadCond; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) { continue; } @@ -338,7 +348,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector TrueCond; MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; - if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { continue; } diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index df1320fbd4c9..e1b0dc724b39 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -18,13 +18,10 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -307,7 +304,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::CBNZW: case AArch64::CBNZX: // These can be converted into a ccmp against #0. - return I; + return &*I; } ++NumCmpTermRejs; DEBUG(dbgs() << "Flags not used by terminator: " << *I); @@ -338,7 +335,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::ADDSWrr: case AArch64::ADDSXrr: if (isDeadDef(I->getOperand(0).getReg())) - return I; + return &*I; DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); ++NumLiveDstRejs; return nullptr; @@ -346,12 +343,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::FCMPDrr: case AArch64::FCMPESrr: case AArch64::FCMPEDrr: - return I; + return &*I; } // Check for flag reads and clobbers. MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI); if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original @@ -496,7 +493,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // The branch we're looking to eliminate must be analyzable. 
HeadCond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) { DEBUG(dbgs() << "Head branch not analyzable.\n"); ++NumHeadBranchRejs; return false; @@ -524,7 +521,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { CmpBBCond.clear(); TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); ++NumCmpBranchRejs; return false; @@ -759,7 +756,6 @@ void initializeAArch64ConditionalComparesPass(PassRegistry &); INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -770,7 +766,6 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -849,9 +844,9 @@ bool AArch64ConditionalCompares::shouldConvert() { // Instruction depths can be computed for all trace instructions above CmpBB. unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth; unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth; DEBUG(dbgs() << "Head depth: " << HeadDepth << "\nCmpBB depth: " << CmpBBDepth << '\n'); if (CmpBBDepth > HeadDepth + DelayLimit) { @@ -891,6 +886,9 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getSubtarget().getSchedModel(); diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 576cf4a74167..7a6f7669db5f 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -48,6 +48,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -88,6 +93,12 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, operand is frame index\n"); continue; } + if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) { + // It is not allowed to write to the same register (not even the zero + // register) twice in a single instruction. 
+ DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); + continue; + } for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDead() && MO.isDef()) { @@ -100,7 +111,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; } // Don't change the register if there's an implicit def of a subreg or - // supperreg. + // superreg. if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); continue; @@ -123,6 +134,8 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( MO.setReg(NewReg); DEBUG(MI.print(dbgs())); ++NumDeadDefsReplaced; + // Only replace one dead register, see check for zero register above. + break; } } } @@ -136,6 +149,9 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + if (skipFunction(*MF.getFunction())) + return false; + for (auto &MBB : MF) if (processMachineBasicBlock(MBB)) Changed = true; diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d24e42a93763..5e477d39e074 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" @@ -46,9 +47,18 @@ public: private: bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + + bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, + unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI); + bool expandCMP_SWAP_128(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char AArch64ExpandPseudo::ID = 0; } @@ -403,9 +413,17 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; + unsigned DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); const unsigned Mask = 0xFFFF; + if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { + // Useless def, and we don't want to risk creating an invalid ORR (which + // would really write to sp). + MI.eraseFromParent(); + return true; + } + // Try a MOVI instruction (aka ORR-immediate with the zero register). 
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; @@ -531,7 +549,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, LastShift = (TZ / 16) * 16; } unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) @@ -572,10 +589,178 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } +static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MBB->addLiveIn(*I); +} + +bool AArch64ExpandPseudo::expandCMP_SWAP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, + unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxr xDest, [xAddr] + // cmp xDest, xDesired + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) + .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) + .addOperand(Desired) + .addImm(ExtendImm); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxr wStatus, xNew, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addOperand(New) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool AArch64ExpandPseudo::expandCMP_SWAP_128( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &DestLo = MI.getOperand(0); + MachineOperand &DestHi = MI.getOperand(1); + unsigned StatusReg = MI.getOperand(2).getReg(); + MachineOperand &Addr = 
MI.getOperand(3); + MachineOperand &DesiredLo = MI.getOperand(4); + MachineOperand &DesiredHi = MI.getOperand(5); + MachineOperand &NewLo = MI.getOperand(6); + MachineOperand &NewHi = MI.getOperand(7); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxp xDestLo, xDestHi, [xAddr] + // cmp xDestLo, xDesiredLo + // sbcs xDestHi, xDesiredHi + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(DestLo.getReg()); + LoadCmpBB->addLiveIn(DestHi.getReg()); + LoadCmpBB->addLiveIn(DesiredLo.getReg()); + LoadCmpBB->addLiveIn(DesiredHi.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) + .addReg(DestLo.getReg(), RegState::Define) + .addReg(DestHi.getReg(), RegState::Define) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) + .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) + .addOperand(DesiredLo) + .addImm(0); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR) + .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) + .addOperand(DesiredHi); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxp wStatus, xNewLo, xNewHi, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(NewLo.getReg()); + StoreBB->addLiveIn(NewHi.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) + .addOperand(NewLo) + .addOperand(NewHi) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + /// \brief If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. 
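Both expansion helpers above build the classic load-exclusive/store-exclusive retry loop: branch to DoneBB when the loaded value differs from the expected one, and loop back to the LDAXR when the STLXR loses the exclusive monitor. As a hedged restatement of the semantics being implemented (not of the emitted machine code):

    #include <atomic>
    #include <cstdint>

    // The contract the CMP_SWAP_* pseudos provide. On success memory is
    // updated; on failure Expected is overwritten with the value observed,
    // mirroring the pseudo's Dest output. compare_exchange_strong hides the
    // retry-on-failed-store loop that the expansion spells out explicitly.
    bool cmpSwap64(std::atomic<uint64_t> &Mem, uint64_t &Expected,
                   uint64_t Desired) {
      return Mem.compare_exchange_strong(Expected, Desired);
    }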
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -717,6 +902,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::CMP_SWAP_8: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_16: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_32: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW, + AArch64::SUBSWrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_64: + return expandCMP_SWAP(MBB, MBBI, + AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::XZR, NextMBBI); + case AArch64::CMP_SWAP_128: + return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); } return false; } @@ -729,7 +936,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); + Modified |= expandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 0ac4b39b0357..e2ab7ab79be1 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -37,7 +37,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" using namespace llvm; namespace { @@ -144,8 +143,8 @@ private: bool computeCallAddress(const Value *V, Address &Addr); bool simplifyAddress(Address &Addr, MVT VT); void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, unsigned ScaleFactor, - MachineMemOperand *MMO); + MachineMemOperand::Flags Flags, + unsigned ScaleFactor, MachineMemOperand *MMO); bool isMemCpySmall(uint64_t Len, unsigned Alignment); bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); @@ -439,9 +438,6 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { .addReg(ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - } else if (OpFlags & AArch64II::MO_CONSTPOOL) { - // We can't handle addresses loaded from a constant pool quickly yet. - return 0; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -555,10 +551,9 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) // Iterate through the GEP folding the constants into offsets where // we can. 
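Both this loop and the selectGetElementPtr rewrite further down replace manual operand walking with gep_type_iterator, which pairs every GEP index with the type it indexes into. A hedged sketch of the idiom for the all-constant case (foldConstantGEPOffset is our name; requires "llvm/IR/GetElementPtrTypeIterator.h" and the IR headers):

    static uint64_t foldConstantGEPOffset(const User *GEP,
                                          const DataLayout &DL) {
      uint64_t Offset = 0;
      for (gep_type_iterator GTI = gep_type_begin(GEP), E = gep_type_end(GEP);
           GTI != E; ++GTI) {
        const auto *CI = dyn_cast<ConstantInt>(GTI.getOperand());
        if (!CI)
          continue; // real code must instead reject or lower dynamic indices
        if (StructType *STy = dyn_cast<StructType>(*GTI))
          // Struct field: ask the layout for the field's byte offset.
          Offset +=
              DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
        else
          // Array/pointer step: element size times the (signed) index.
          Offset += DL.getTypeAllocSize(GTI.getIndexedType()) *
                    (uint64_t)CI->getSExtValue();
      }
      return Offset;
    }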
- gep_type_iterator GTI = gep_type_begin(U); - for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; - ++i, ++GTI) { - const Value *Op = *i; + for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U); + GTI != E; ++GTI) { + const Value *Op = GTI.getOperand(); if (StructType *STy = dyn_cast(*GTI)) { const StructLayout *SL = DL.getStructLayout(STy); unsigned Idx = cast(Op)->getZExtValue(); @@ -947,10 +942,7 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { @@ -1048,7 +1040,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { void AArch64FastISel::addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, + MachineMemOperand::Flags Flags, unsigned ScaleFactor, MachineMemOperand *MMO) { int64_t Offset = Addr.getOffset() / ScaleFactor; @@ -1612,8 +1604,8 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWri, AArch64::ANDXri }, { AArch64::ORRWri, AArch64::ORRXri }, @@ -1659,8 +1651,8 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, uint64_t ShiftImm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWrs, AArch64::ANDXrs }, { AArch64::ORRWrs, AArch64::ORRXrs }, @@ -1904,6 +1896,21 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { cast(I)->isAtomic()) return false; + const Value *SV = I->getOperand(0); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast(SV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast(SV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // See if we can handle this address. Address Addr; if (!computeAddress(I->getOperand(0), Addr, I->getType())) @@ -2068,6 +2075,21 @@ bool AArch64FastISel::selectStore(const Instruction *I) { cast(I)->isAtomic()) return false; + const Value *PtrV = I->getOperand(1); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast(PtrV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast(PtrV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // Get the value to be stored into a register. Use the zero register directly // when possible to avoid an unnecessary copy and a wasted register. 
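The two swifterror hunks above apply the same rule in selectLoad and selectStore: FastISel must not touch swifterror values, which only reach it as function arguments or allocas carrying the attribute. A hedged sketch of the shared check (isSwiftErrorValue is a placeholder helper, not in the patch):

    static bool isSwiftErrorValue(const Value *V) {
      if (const auto *Arg = dyn_cast<Argument>(V))
        return Arg->hasSwiftErrorAttr();
      if (const auto *Alloca = dyn_cast<AllocaInst>(V))
        return Alloca->isSwiftError();
      return false; // nothing else can carry swifterror here
    }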
unsigned SrcReg = 0; @@ -2813,6 +2835,8 @@ bool AArch64FastISel::fastLowerArguments() { if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; @@ -3064,7 +3088,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; for (auto Flag : CLI.OutFlags) - if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal()) + if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() || + Flag.isSwiftSelf() || Flag.isSwiftError()) return false; // Set up the argument vectors. @@ -3646,6 +3671,10 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) return false; @@ -4814,18 +4843,18 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { // Keep a running tab of the total offset to coalesce multiple N = N + Offset // into a single N = N + TotalOffset. uint64_t TotalOffs = 0; - Type *Ty = I->getOperand(0)->getType(); MVT VT = TLI.getPointerTy(DL); - for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { - const Value *Idx = *OI; - if (auto *StTy = dyn_cast(Ty)) { + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); + GTI != E; ++GTI) { + const Value *Idx = GTI.getOperand(); + if (auto *StTy = dyn_cast(*GTI)) { unsigned Field = cast(Idx)->getZExtValue(); // N = N + Offset if (Field) TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); - Ty = StTy->getElementType(Field); } else { - Ty = cast(Ty)->getElementType(); + Type *Ty = GTI.getIndexedType(); + // If this is a constant subscript, handle it quickly. if (const auto *CI = dyn_cast(Idx)) { if (CI->isZero()) diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3f63d049c34e..82111e5c7259 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -93,6 +93,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. 
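The canUseRedZone cleanup in this hunk collapses the early-return into a single predicate. As a hedged standalone restatement (the real member function also refuses up front when the function carries the noredzone attribute):

    // Red-zone addressing is only safe for a leaf function with no frame
    // pointer and at most 128 bytes of locals kept below SP.
    static bool canUseRedZoneSketch(bool HasCalls, bool HasFP,
                                    unsigned LocalStackBytes) {
      return !HasCalls && !HasFP && LocalStackBytes <= 128;
    }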
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; + return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasStackMap() || - MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); + // Retain behavior of always omitting the FP for leaf functions when possible. + return (MFI->hasCalls() && + MF.getTarget().Options.DisableFramePointerElim(MF)) || + MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } -void AArch64FrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = @@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); + Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; @@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. // - // Mostly call frames will be allocated at the start of a function so + // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); @@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } - MBB.erase(I); + return MBB.erase(I); } void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout &TD = MF.getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD.getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. 
This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - + int64_t Offset = + MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } -/// Get FPOffset by analyzing the first instruction. -static int getFPOffsetInPrologue(MachineInstr *MBBI) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert(((MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre) && - MBBI->getOperand(3).getReg() == AArch64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0)); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - return FPOffset; -} +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { + MachineFunction *MF = MBB->getParent(); + + // If MBB is an entry block, use X9 as the scratch register + if (&MF->front() == MBB) + return AArch64::X9; + + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(&TRI); + LiveRegs.addLiveIns(*MBB); + + // Mark callee saved registers as used so we will not choose them. + const AArch64Subtarget &Subtarget = MF->getSubtarget(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + // Prefer X9 since it was historically used for the prologue scratch reg. 
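findScratchNonCalleeSaveRegister above and expandCMP_SWAP earlier in the patch both lean on LivePhysRegs. A hedged sketch of the underlying idiom (isFreeAt is our name; it assumes I points at a real instruction in a non-empty block): seed the set at a block boundary, then step to the point of interest.

    // Recover register liveness at an arbitrary point by seeding from the
    // block's live-outs and walking backwards, as expandCMP_SWAP does;
    // findScratchNonCalleeSaveRegister uses the simpler live-in seeding at
    // the block start instead.
    static bool isFreeAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         unsigned Reg, const TargetRegisterInfo &TRI,
                         const MachineRegisterInfo &MRI) {
      LivePhysRegs LiveRegs(&TRI);
      LiveRegs.addLiveOuts(MBB);
      for (auto It = std::prev(MBB.end()); It != I; --It)
        LiveRegs.stepBackward(*It);
      return LiveRegs.available(MRI, Reg);
    }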
+ const MachineRegisterInfo &MRI = MF->getRegInfo(); + if (LiveRegs.available(MRI, AArch64::X9)) + return AArch64::X9; -static bool isCSSave(MachineInstr *MBBI) { - return MBBI->getOpcode() == AArch64::STPXi || - MBBI->getOpcode() == AArch64::STPDi || - MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre; + for (unsigned Reg : AArch64::GPR64RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *TmpMBB = const_cast(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - // Otherwise, we may need a scratch register to be available and we do not - // support that for now. - return !RegInfo->needsStackRealignment(*MF); + if (!RegInfo->needsStackRealignment(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. + return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; +} + +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512) + return false; + + if (MFI->hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. + if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. +static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { + + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. 
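The conversion this helper performs is easiest to see at the assembly level, together with the immediate scaling handled just below (an illustrative example with a 48-byte callee-save area; scaledImm is our name):

    // Paired STP/LDP pre/post forms take an 8-byte-scaled immediate; the
    // single-register STR/LDR pre/post forms take an unscaled byte offset.
    static int64_t scaledImm(int64_t CSStackSizeInc, bool NewIsUnscaled) {
      return NewIsUnscaled ? CSStackSizeInc : CSStackSizeInc / 8;
    }
    // So for a 48-byte area the first save
    //     stp x22, x21, [sp, #0]
    // becomes the allocating form
    //     stp x22, x21, [sp, #-48]!   // STPXpre, immediate -48/8 = -6
    // and the matching final restore becomes ldp x22, x21, [sp], #48.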
+ unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.addOperand(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + unsigned LocalStackSize) { + unsigned Opc = MI.getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; + assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. + assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, @@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - + if (!NumBytes) + return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { + if (canUseRedZone(MF)) + ++NumRedZoneFunctions; + else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - } else if (NumBytes) { - ++NumRedZoneFunctions; } - return; } - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) - FPOffset = getFPOffsetInPrologue(MBBI); + auto CSStackSize = AFI->getCalleeSavedStackSize(); + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes - CSStackSize); - // Move past the saves of the callee-saved registers. 
- while (isCSSave(MBBI)) { - ++MBBI; - NumBytes -= 16; + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (CSStackSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -CSStackSize); + NumBytes -= CSStackSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + MachineBasicBlock::iterator End = MBB.end(); + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); + ++MBBI; + } if (HasFP) { + // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. + int FPOffset = CSStackSize - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". @@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - // Allocate space for the rest of the frame. + if (NumBytes) { + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; - const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); - unsigned scratchSPReg = AArch64::SP; - if (NumBytes && NeedsRealignment) { - // Use the first callee-saved register as a scratch register. - scratchSPReg = AArch64::X9; - } + if (NeedsRealignment) { + scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + assert(scratchSPReg != AArch64::NoRegister); + } - // If we're a leaf function, try using the red zone. - if (NumBytes && !canUseRedZone(MF)) - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); - if (NumBytes && NeedsRealignment) { - const unsigned NrBitsToZero = countTrailingZeros(Alignment); - assert(NrBitsToZero > 1); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - // The logical immediates have a non-trivial encoding. The following - // formula computes the encoded immediate with all ones but - // NrBitsToZero zero bits as least significant bits. 
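The AND-mask computation retained here deserves a worked example. A hedged standalone restatement (encodeAlignMask is our name):

    #include <cstdint>

    // Encode the logical-immediate AND mask that clears the low NrBitsToZero
    // bits of a 64-bit value: N=1 selects a 64-bit pattern, imms requests a
    // run of (64 - NrBitsToZero) ones, and immr rotates it into place.
    uint32_t encodeAlignMask(unsigned NrBitsToZero) {
      return (1u << 12)                      // N
           | ((64 - NrBitsToZero) << 6)      // immr
           | ((64 - NrBitsToZero - 1) << 0); // imms
    }
    // For 16-byte alignment, NrBitsToZero == 4: a run of 60 ones rotated
    // right by 60 yields 0xFFFFFFFFFFFFFFF0, i.e. "and sp, x9, #~15".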
- uint32_t andMaskEncoded = - (1 <<12) // = N - | ((64-NrBitsToZero) << 6) // immr - | ((64-NrBitsToZero-1) << 0) // imms - ; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsRealignment) { + const unsigned Alignment = MFI->getMaxAlignment(); + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = (1 << 12) // = N + | ((64 - NrBitsToZero) << 6) // immr + | ((64 - NrBitsToZero - 1) << 0); // imms + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + AFI->setStackRealigned(true); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( @@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); } - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -/// Checks whether the given instruction restores callee save registers -/// and if so returns how many. -static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - switch (MI.getOpcode()) { - case AArch64::LDPXpost: - case AArch64::LDPDpost: - RtIdx = 1; - // FALLTHROUGH - case AArch64::LDPXi: - case AArch64::LDPDi: - if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || - MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) - return 0; - return 2; + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). 
+ emitCalleeSavedFrameMoves(MBB, MBBI); } - return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, @@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; @@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 8) | | | + // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - unsigned NumRestores = 0; + auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && CSStackSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); - NumRestores += Restores; - if (Restores == 0) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize()); + } + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; } - NumBytes -= NumRestores * 8; + + NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { + bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, - TII); - return; + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && ArgumentPopSize == 0) + return; + + bool NoCalleeSaveRestore = CSStackSize == 0; + int StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += ArgumentPopSize; + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackRestoreBytes, TII, MachineInstr::FrameDestroy); + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || ArgumentPopSize == 0) + return; + NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. 
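The restore sequence that follows picks between two ways of undoing the frame. A hedged standalone restatement of the FP-based case (the function name is ours):

    // When variable-sized objects exist or the stack was realigned, SP at
    // this point is not trustworthy, so it is recomputed from FP. The
    // prologue established fp = sp + (CSStackSize - 16) right after the
    // callee-save allocation, so the epilogue applies the inverse offset:
    static int64_t epilogueSPFromFP(unsigned CSStackSize) {
      return -(int64_t)CSStackSize + 16; // sp = fp - CSStackSize + 16
    }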
- if (NumBytes || MFI->hasVarSizedObjects()) + if (MFI->hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); + -CSStackSize + 16, TII, MachineInstr::FrameDestroy); + else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, + MachineInstr::FrameDestroy); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (ArgumentPopSize) + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + ArgumentPopSize, TII, MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != AArch64::LR) - return getKillRegState(true); + // Do not set a kill flag on values that are also marked as live-in. This + // happens with the @llvm-returnaddress intrinsic and with arguments passed in + // callee saved registers. + // Omitting the kill flags is conservatively correct even if the live-in + // is not used after all. + bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); + return getKillRegState(!IsLiveIn); +} - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); +static bool produceCompactUnwindFrame(MachineFunction &MF) { + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + AttributeSet Attrs = MF.getFunction()->getAttributes(); + return Subtarget.isTargetMachO() && + !(Subtarget.getTargetLowering()->supportSwiftError() && + Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -bool AArch64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + +struct RegPairInfo { + RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {} + unsigned Reg1; + unsigned Reg2; + int FrameIdx; + int Offset; + bool IsGPR; + bool isPaired() const { return Reg2 != AArch64::NoRegister; } +}; + +static void computeCalleeSaveRegisterPairs( + MachineFunction &MF, const std::vector &CSI, + const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs) { + + if (CSI.empty()) + return; + + AArch64FunctionInfo *AFI = MF.getInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + CallingConv::ID CC = MF.getFunction()->getCallingConv(); unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + (void)CC; + // MachO's compact unwind format relies on all registers being stored in + // pairs. 
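The pairing and offset bookkeeping that computeCalleeSaveRegisterPairs performs below can be restated standalone for the common all-paired case (pairOffsetImm is our name):

    // Offsets are assigned top-down through the callee-save area, 16 bytes
    // per pair, and the STP/LDP immediate is the byte offset divided by 8.
    static unsigned pairOffsetImm(unsigned CSStackSize, unsigned PairIndex) {
      return (CSStackSize - 16 * (PairIndex + 1)) / 8;
    }
    // With a 48-byte area and CSI order {lr,fp},{x19,x20},{x21,x22}:
    //   pair 0 -> imm 4 -> stp fp, lr,   [sp, #32]
    //   pair 1 -> imm 2 -> stp x20, x19, [sp, #16]
    //   pair 2 -> imm 0 -> stp x22, x21, [sp, #0]
    // spillCalleeSavedRegisters then walks the pairs in reverse, so the
    // lowest-offset store is emitted first.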
+ assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (Count & 1) == 0) && + "Odd number of callee-saved regs to spill!"); + unsigned Offset = AFI->getCalleeSavedStackSize(); + + for (unsigned i = 0; i < Count; ++i) { + RegPairInfo RPI; + RPI.Reg1 = CSI[i].getReg(); + + assert(AArch64::GPR64RegClass.contains(RPI.Reg1) || + AArch64::FPR64RegClass.contains(RPI.Reg1)); + RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); + + // Add the next reg to the pair if it is in the same register class. + if (i + 1 < Count) { + unsigned NextReg = CSI[i + 1].getReg(); + if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || + (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) + RPI.Reg2 = NextReg; + } - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + assert((!RPI.isPaired() || + (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + + // MachO's compact unwind format relies on all registers being stored in + // adjacent register pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (RPI.isPaired() && + ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || + RPI.Reg1 + 1 == RPI.Reg2))) && + "Callee-save registers not saved as adjacent register pair!"); + + RPI.FrameIdx = CSI[i].getFrameIdx(); + + if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + Offset -= 16; + assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16); + MFI->setObjectAlignment(RPI.FrameIdx, 16); + AFI->setCalleeSaveStackHasFreeSpace(true); + } else + Offset -= RPI.isPaired() ? 16 : 8; + assert(Offset % 8 == 0); + RPI.Offset = Offset / 8; + assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + "Offset out of bounds for LDP/STP immediate"); + + RegPairs.push_back(RPI); + if (RPI.isPaired()) + ++i; + } +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + SmallVector RegPairs; + + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + + // Issue sequence of spills for cs regs. The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. 
// For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPXpre; - else - StrOpc = AArch64::STPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPDpre; - else - StrOpc = AArch64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) - MIB.addReg(AArch64::SP, RegState::Define); + // Note: Similar rationale and sequence for restores in epilog. + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); MBB.addLiveIn(Reg1); - MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) + if (RPI.isPaired()) { + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOStore, 8, 8)); + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOStore, 8, 8)); } return true; } @@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned Count = CSI.size(); DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + SmallVector RegPairs; if (MI != MBB.end()) DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. 
We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; + + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPXpost; - else - LdrOpc = AArch64::LDPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPDpost; - else - LdrOpc = AArch64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? 
Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) - MIB.addReg(AArch64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) + if (RPI.isPaired()) { + MIB.addReg(Reg2, getDefRegState(true)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOLoad, 8, 8)); + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOLoad, 8, 8)); } return true; } @@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64RegisterInfo *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo(); - SmallVector UnspilledCSGPRs; - SmallVector UnspilledCSFPRs; + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { @@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. + unsigned BasePointerReg = AArch64::NoRegister; if (RegInfo->hasBasePointer(MF)) - SavedRegs.set(RegInfo->getBaseRegister()); - - if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - SavedRegs.set(AArch64::X9); + BasePointerReg = RegInfo->getBaseRegister(); - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; + + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) + SavedRegs.set(Reg); - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((AArch64::GPR64RegClass.contains(OddReg) && - AArch64::GPR64RegClass.contains(EvenReg)) ^ - (AArch64::FPR64RegClass.contains(OddReg) && - AArch64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = SavedRegs.test(OddReg); - const bool EvenRegUsed = SavedRegs.test(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. 
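The addImm(+0)/addImm(+2)/addImm(+4) operands in the spill and restore sequences above are scaled immediates: for 64-bit register pairs the encoded [sp, #imm] byte offset is the operand times 8, and the hardware imm7 field limits it to [-64, 63] units, as the old asserts checked. A minimal sketch of that scaling (the helper name is illustrative, not part of the patch):

    #include <cassert>

    // Scaled STP/LDP pair addressing: the operand counts 8-byte units.
    int pairOffsetBytes(int OffsetInUnits) {
      assert(OffsetInUnits >= -64 && OffsetInUnits <= 63 &&
             "Offset out of bounds for STP/LDP immediate");
      return OffsetInUnits * 8; // addImm(+2) -> [sp, #16]
    }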
- if (!OddRegUsed && !EvenRegUsed) { - if (AArch64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); + bool RegUsed = SavedRegs.test(Reg); + unsigned PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRPaired = PairedReg; } continue; } - unsigned Reg = AArch64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - SavedRegs.set(Reg); + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + SavedRegs.set(PairedReg); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = true; } + } - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (AArch64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + dbgs() << ' ' << PrintReg(Reg, RegInfo); + dbgs() << "\n";); - CanEliminateFrame = false; - } + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; // FIXME: Set BigStack if any stack slot references may be out of range. // For now, just conservatively guestimate based on unscaled indexing @@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = - MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); bool BigStack = (CFSize >= 256); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) @@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // above to keep the number of spills even, we don't need to do anything else // here. if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. 
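The rewritten loop walks the callee-saved registers one at a time and relies on the list being laid out in partner pairs, so CSRegs[i ^ 1] names the other half of register i's pair. A small sketch of that index trick, assuming an even-length CSR array with partners adjacent:

    // XOR-ing the low bit of the index swaps within each aligned pair,
    // which is how the compact-unwind code above finds a register's partner.
    unsigned pairedCSR(const unsigned *CSRegs, unsigned i) {
      return CSRegs[i ^ 1]; // 0<->1, 2<->3, 4<->5, ...
    }

MachO's compact unwind format then forces the partner to be saved as well whenever only one register of a pair is live, as the comments above note.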
- assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - SavedRegs.set(Reg); + if (UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); + SavedRegs.set(UnspilledCSGPR); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. + if (produceCompactUnwindFrame(MF)) + SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = true; - ++Count; + NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create @@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << " as the emergency spill slot.\n"); } } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); +} + +bool AArch64FrameLowering::enableStackSlotScavenging( + const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return AFI->hasCalleeSaveStackFreeSpace(); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7d8354c38787..f254ea9b70aa 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -25,12 +25,11 @@ public: true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. @@ -67,6 +66,12 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + + bool enableStackSlotScavenging(const MachineFunction &MF) const override; + +private: + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6c868880bcac..8d649250f656 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -57,7 +57,7 @@ public: return SelectionDAGISel::runOnMachineFunction(MF); } - SDNode *Select(SDNode *Node) override; + void Select(SDNode *Node) override; /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
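The setCalleeSavedStackSize call above rounds the callee-save area up to 16 bytes, which is what can leave the free slot that enableStackSlotScavenging later reports via hasCalleeSaveStackFreeSpace. A worked sketch of the computation, assuming llvm::alignTo rounds up to a multiple of its second argument:

    #include <cstdint>

    uint64_t calleeSavedStackSize(unsigned NumRegsSpilled) {
      uint64_t Bytes = 8 * NumRegsSpilled; // one 8-byte slot per saved register
      return (Bytes + 15) & ~uint64_t(15); // alignTo(Bytes, 16)
    }
    // 3 saved registers -> 24 bytes, rounded up to 32; the spare 8 bytes are
    // the callee-save free space that stack slot scavenging can reuse.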
@@ -65,8 +65,8 @@ public: unsigned ConstraintID, std::vector &OutOps) override; - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool tryMLAV64LaneV128(SDNode *N); + bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); @@ -147,28 +147,29 @@ public: SDValue createTuple(ArrayRef Vecs, const unsigned RegClassIDs[], const unsigned SubRegs[]); - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + bool tryIndexedLoad(SDNode *N); - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); + bool tryBitfieldExtractOp(SDNode *N); + bool tryBitfieldExtractOpFromSExt(SDNode *N); + bool tryBitfieldInsertOp(SDNode *N); + bool tryBitfieldInsertInZeroOp(SDNode *N); - SDNode *SelectReadRegister(SDNode *N); - SDNode *SelectWriteRegister(SDNode *N); + bool tryReadRegister(SDNode *N); + bool tryWriteRegister(SDNode *N); // Include the pieces autogenerated from the target description. #include "AArch64GenDAGISel.inc" @@ -198,6 +199,9 @@ private: } bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); + + void SelectCMP_SWAP(SDNode *N); + }; } // end anonymous namespace @@ -328,9 +332,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { // it hurts if the value is used at least twice, unless we are optimizing // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; + return ForCodeSize || V.hasOneUse(); } /// SelectShiftedRegister - Select a "shifted register" operand. If the value @@ -452,7 +454,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand /// is a lane in the upper half of a 128-bit vector. Recognize and select this /// so that we don't emit unnecessary lane extracts. 
-SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { +bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) { SDLoc dl(N); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -467,7 +469,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { if (Op1.getOpcode() != ISD::MUL || !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, LaneIdx)) - return nullptr; + return false; } SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -493,10 +495,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { break; } - return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops)); + return true; } -SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { +bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) { SDLoc dl(N); SDValue SMULLOp0; SDValue SMULLOp1; @@ -504,7 +507,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, LaneIdx)) - return nullptr; + return false; SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -537,7 +540,8 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { } else llvm_unreachable("Unrecognized intrinsic."); - return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops)); + return true; } /// Instructions that accept extend modifiers like UXTW expect the register @@ -610,7 +614,7 @@ static bool isWorthFoldingADDlow(SDValue N) { // ldar and stlr have much more restrictive addressing modes (just a // register). 
- if (cast(Use)->getOrdering() > Monotonic) + if (isStrongerThanMonotonic(cast(Use)->getOrdering())) return false; } @@ -687,7 +691,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getType()->getElementType(); + Type *Ty = GV->getValueType(); if (Alignment == 0 && Ty->isSized()) Alignment = DL.getABITypeAlignment(Ty); @@ -797,10 +801,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - if (isWorthFolding(N)) - return true; - - return false; + return isWorthFolding(N); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1015,8 +1016,8 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, return SDValue(N, 0); } -SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { +void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, + bool isExt) { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -1033,13 +1034,13 @@ SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, Ops.push_back(N->getOperand(1)); Ops.push_back(RegSeq); Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); } -SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { +bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); if (LD->isUnindexed()) - return nullptr; + return false; EVT VT = LD->getMemoryVT(); EVT DstVT = N->getValueType(0); ISD::MemIndexedMode AM = LD->getAddressingMode(); @@ -1101,7 +1102,7 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { } else if (VT.is128BitVector()) { Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; } else - return nullptr; + return false; SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); ConstantSDNode *OffsetOp = cast(LD->getOffset()); @@ -1112,7 +1113,6 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT, MVT::Other, Ops); // Either way, we're replacing the node, so tell the caller that. 
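The indexed-load selection above chooses between pre- and post-increment opcodes (e.g. LDRXpre/LDRXpost) for loads with writeback. A toy model of the two addressing modes under the usual AArch64 semantics (function names are illustrative only):

    #include <cstdint>

    // [base, #off]! : write the updated address back first, then load from it.
    uint64_t loadPreIndex(const char *&Base, long Off) {
      Base += Off;
      return *(const uint64_t *)Base;
    }

    // [base], #off : load from the original address, then write back base+off.
    uint64_t loadPostIndex(const char *&Base, long Off) {
      uint64_t V = *(const uint64_t *)Base;
      Base += Off;
      return V;
    }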
- Done = true; SDValue LoadedVal = SDValue(Res, 1); if (InsertTo64) { SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); @@ -1127,12 +1127,12 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { ReplaceUses(SDValue(N, 0), LoadedVal); ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - - return nullptr; + CurDAG->RemoveDeadNode(N); + return true; } -SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1149,11 +1149,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1181,11 +1181,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, // Update the chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); @@ -1197,11 +1197,11 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); const EVT ResTys[] = {MVT::i64, // Type of the write back register @@ -1218,7 +1218,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, N->getOperand(0)}; // Chain SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - return St; + ReplaceNode(N, St); } namespace { @@ -1256,8 +1256,8 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { V128Reg); } -SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1292,12 +1292,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, } ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1348,12 +1347,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, // Update the Chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - 
return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1379,11 +1377,11 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast(N)->getMemOperand(); cast(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1414,7 +1412,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast(N)->getMemOperand(); cast(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, @@ -1441,25 +1439,25 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // form these situations when matching bigger pattern (bitfield insert). // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + uint64_t AndImm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, AndImm)) return false; const SDNode *Op0 = N->getOperand(0).getNode(); // Because of simplify-demanded-bits in DAGCombine, the mask may have been // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + AndImm |= (1 << NumberOfIgnoredLowBits) - 1; // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) + if (AndImm & (AndImm + 1)) return false; bool ClampMSB = false; - uint64_t Srl_imm = 0; + uint64_t SrlImm = 0; // Handle the SRL + ANY_EXTEND case. if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) { // Extend the incoming operand of the SRL to 64-bit. Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); // Make sure to clamp the MSB so that we preserve the semantics of the @@ -1467,13 +1465,13 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, ClampMSB = true; } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { + SrlImm)) { // If the shift result was truncated, we can still combine them. Opd0 = Op0->getOperand(0).getOperand(0); // Use the type of SRL node. VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) { Opd0 = Op0->getOperand(0); } else if (BiggerPattern) { // Let's pretend a 0 shift right has been performed. @@ -1487,15 +1485,15 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // Bail out on large immediates. This happens when no proper // combining/constant folding was performed. - if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) { + if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? 
countTrailingOnes(And_imm) - : countTrailingOnes(And_imm)) - + LSB = SrlImm; + MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes(AndImm) + : countTrailingOnes(AndImm)) - 1; if (ClampMSB) // Since we're moving the extend before the right shift operation, we need @@ -1508,6 +1506,39 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, return true; } +static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &Immr, + unsigned &Imms) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + + EVT VT = N->getValueType(0); + unsigned BitWidth = VT.getSizeInBits(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + SDValue Op = N->getOperand(0); + if (Op->getOpcode() == ISD::TRUNCATE) { + Op = Op->getOperand(0); + VT = Op->getValueType(0); + BitWidth = VT.getSizeInBits(); + } + + uint64_t ShiftImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) && + !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + unsigned Width = cast(N->getOperand(1))->getVT().getSizeInBits(); + if (ShiftImm + Width > BitWidth) + return false; + + Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri; + Opd0 = Op.getOperand(0); + Immr = ShiftImm; + Imms = ShiftImm + Width - 1; + return true; +} + static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &LSB, unsigned &MSB) { @@ -1522,32 +1553,32 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, // // This gets selected into a single UBFM: // - // UBFM Value, ShiftImm, BitWide + Srl_imm -1 + // UBFM Value, ShiftImm, BitWide + SrlImm -1 // if (N->getOpcode() != ISD::SRL) return false; - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + uint64_t AndMask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask)) return false; Opd0 = N->getOperand(0).getOperand(0); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; // Check whether we really have several bits extract here. - unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm)); - if (BitWide && isMask_64(And_mask >> Srl_imm)) { + unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm)); + if (BitWide && isMask_64(AndMask >> SrlImm)) { if (N->getValueType(0) == MVT::i32) Opc = AArch64::UBFMWri; else Opc = AArch64::UBFMXri; - LSB = Srl_imm; - MSB = BitWide + Srl_imm - 1; + LSB = SrlImm; + MSB = BitWide + SrlImm - 1; return true; } @@ -1572,10 +1603,10 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) return true; - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + // We're looking for a shift of a shift. + uint64_t ShlImm = 0; + uint64_t TruncBits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) { Opd0 = N->getOperand(0).getOperand(0); } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { @@ -1584,7 +1615,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // always generate 64bit UBFM. This consistency will help the CSE pass // later find more redundancy. 
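For the shift-plus-mask extracts recognized above, the UBFM fields fall out of the shift amount and the mask width. A worked sketch, assuming the mask has already passed the (AndImm & (AndImm + 1)) == 0 low-bits-mask check:

    #include <cstdint>

    void ubfmFieldsForShiftMask(uint64_t SrlImm, uint64_t AndImm,
                                unsigned &LSB, unsigned &MSB) {
      unsigned Width = 0;               // trailing ones in the mask
      for (uint64_t M = AndImm; M & 1; M >>= 1)
        ++Width;
      LSB = SrlImm;                     // immr: first extracted bit
      MSB = SrlImm + Width - 1;         // imms: last extracted bit
    }
    // (x >> 5) & 0x3f -> LSB = 5, MSB = 10, i.e. UBFM x, #5, #10.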
Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); VT = Opd0->getValueType(0); assert(VT == MVT::i64 && "the promoted type should be i64"); } else if (BiggerPattern) { @@ -1597,21 +1628,21 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // Missing combines/constant folding may have left us with strange // constants. - if (Shl_imm >= VT.getSizeInBits()) { + if (ShlImm >= VT.getSizeInBits()) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() && "bad amount in shift node!"); - int immr = Srl_imm - Shl_imm; + int immr = SrlImm - ShlImm; Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; - Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; + Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1; // SRA requires a signed extraction if (VT == MVT::i32) Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; @@ -1620,6 +1651,30 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, return true; } +bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) { + assert(N->getOpcode() == ISD::SIGN_EXTEND); + + EVT VT = N->getValueType(0); + EVT NarrowVT = N->getOperand(0)->getValueType(0); + if (VT != MVT::i64 || NarrowVT != MVT::i32) + return false; + + uint64_t ShiftImm; + SDValue Op = N->getOperand(0); + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + SDLoc dl(N); + // Extend the incoming operand of the shift to 64-bits. 
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0)); + unsigned Immr = ShiftImm; + unsigned Imms = NarrowVT.getSizeInBits() - 1; + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; + CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops); + return true; +} + static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, @@ -1638,6 +1693,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case ISD::SRL: case ISD::SRA: return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); + + case ISD::SIGN_EXTEND_INREG: + return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms); } unsigned NOpc = N->getMachineOpcode(); @@ -1658,11 +1716,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) { unsigned Opc, Immr, Imms; SDValue Opd0; if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) - return nullptr; + return false; EVT VT = N->getValueType(0); SDLoc dl(N); @@ -1675,22 +1733,22 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i32, SDValue(BFM, 0), SubReg)); + return true; } SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), CurDAG->getTargetConstant(Imms, dl, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } /// Does DstMask form a complementary pair with the mask provided by /// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, /// this asks whether DstMask zeroes precisely those bits that will be set by /// the other half. 
-static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, +static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted, unsigned NumberOfIgnoredHighBits, EVT VT) { assert((VT == MVT::i32 || VT == MVT::i64) && "i32 or i64 mask type expected!"); @@ -1851,6 +1909,20 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, case AArch64::BFMWri: case AArch64::BFMXri: return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + + case AArch64::STRBBui: + case AArch64::STURBBi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff); + return; + + case AArch64::STRHHui: + case AArch64::STURHHi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff); + return; } } @@ -1963,36 +2035,129 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, return true; } -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, const APInt &UsefulBits, - SelectionDAG *CurDAG) { +static bool isShiftedMask(uint64_t Mask, EVT VT) { + assert(VT == MVT::i32 || VT == MVT::i64); + if (VT == MVT::i32) + return isShiftedMask_32(Mask); + return isShiftedMask_64(Mask); +} + +// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being +// inserted only sets known zero bits. +static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - // Set Opc EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = AArch64::BFMWri; - else if (VT == MVT::i64) - Opc = AArch64::BFMXri; - else + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + + uint64_t OrImm; + if (!isOpcWithIntImmediate(N, ISD::OR, OrImm)) + return false; + + // Skip this transformation if the ORR immediate can be encoded in the ORR. + // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely + // performance neutral. + if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth)) return false; + uint64_t MaskImm; + SDValue And = N->getOperand(0); + // Must be a single use AND with an immediate operand. + if (!And.hasOneUse() || + !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm)) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(And, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value. + uint64_t NotKnownZero = (~KnownZero).getZExtValue(); + + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!isShiftedMask(KnownZero.getZExtValue(), VT)) + return false; + + // The bits being inserted must only set those bits that are known to be zero. 
+ if ((OrImm & NotKnownZero) != 0) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + // BFI/BFXIL dst, src, #lsb, #width. + int LSB = countTrailingOnes(NotKnownZero); + int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation(); + + // BFI/BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // If we're creating a BFI instruction avoid cases where we need more + // instructions to materialize the BFI constant as compared to the original + // ORR. A BFXIL will use the same constant as the original ORR, so the code + // should be no worse in this case. + bool IsBFI = LSB != 0; + uint64_t BFIImm = OrImm >> LSB; + if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) { + // We have a BFI instruction and we know the constant can't be materialized + // with a ORR-immediate with the zero register. + unsigned OrChunks = 0, BFIChunks = 0; + for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) { + if (((OrImm >> Shift) & 0xFFFF) != 0) + ++OrChunks; + if (((BFIImm >> Shift) & 0xFFFF) != 0) + ++BFIChunks; + } + if (BFIChunks > OrChunks) + return false; + } + + // Materialize the constant to be inserted. + SDLoc DL(N); + unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; + SDNode *MOVI = CurDAG->getMachineNode( + MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT)); + + // Create the BFI/BFXIL instruction. + SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; +} + +static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, + SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + // Given a OR operation, check if we have the following pattern + // ubfm c, b, imm, imm2 (or something that does the same jobs, see + // isBitfieldExtractOp) + // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and + // countTrailingZeros(mask2) == imm2 - imm + 1 + // f = d | c + // if yes, replace the OR instruction with: + // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2 + // OR is commutative, check all combinations of operand order and values of // BiggerPattern, i.e. // Opd0, Opd1, BiggerPattern=false @@ -2004,8 +2169,11 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // and/or inserting fewer extra instructions. for (int I = 0; I < 4; ++I) { + SDValue Dst, Src; + unsigned ImmR, ImmS; bool BiggerPattern = I / 2; - SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd0Val = N->getOperand(I % 2); + SDNode *OrOpd0 = OrOpd0Val.getNode(); SDValue OrOpd1Val = N->getOperand((I + 1) % 2); SDNode *OrOpd1 = OrOpd1Val.getNode(); @@ -2030,10 +2198,10 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. 
We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val, BiggerPattern, Src, DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmR = (BitWidth - DstLSB) % BitWidth; ImmS = Width - 1; } else continue; @@ -2069,60 +2237,98 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, Dst = OrOpd1Val; // both parts match + SDLoc DL(N); + SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; + } + + // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff + // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted + // mask (e.g., 0x000ffff0). + uint64_t Mask0Imm, Mask1Imm; + SDValue And0 = N->getOperand(0); + SDValue And1 = N->getOperand(1); + if (And0.hasOneUse() && And1.hasOneUse() && + isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) && + isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) && + APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) && + (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) { + + // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm), + // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the + // bits to be inserted. + if (isShiftedMask(Mask0Imm, VT)) { + std::swap(And0, And1); + std::swap(Mask0Imm, Mask1Imm); + } + + SDValue Src = And1->getOperand(0); + SDValue Dst = And0->getOperand(0); + unsigned LSB = countTrailingZeros(Mask1Imm); + int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation(); + + // The BFXIL inserts the low-order bits from a source register, so right + // shift the needed bits into place. + SDLoc DL(N); + unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + SDNode *LSR = CurDAG->getMachineNode( + ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT), + CurDAG->getTargetConstant(BitWidth - 1, DL, VT)); + + // BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // Create the BFXIL instruction. + SDValue Ops[] = {Dst, SDValue(LSR, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); return true; } return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) { if (N->getOpcode() != ISD::OR) - return nullptr; + return false; - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - EVT VT = N->getValueType(0); APInt NUsefulBits; getUsefulBits(SDValue(N, 0), NUsefulBits); // If all bits are not useful, just return UNDEF. 
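Each insert path above finishes by translating the alias-level #lsb/#width operands of BFI/BFXIL into BFM's rotate and size immediates. A compact sketch of that translation, assuming BFI dst, src, #lsb, #width semantics:

    #include <cassert>

    // BFI/BFXIL are aliases of BFM: ImmR right-rotates the source into place
    // and ImmS is the index of the highest source bit copied.
    void bfmOperands(unsigned BitWidth, unsigned LSB, unsigned Width,
                     unsigned &ImmR, unsigned &ImmS) {
      assert(Width >= 1 && LSB + Width <= BitWidth);
      ImmR = (BitWidth - LSB) % BitWidth; // LSB == 0 (BFXIL) gives ImmR == 0
      ImmS = Width - 1;
    }
    // BFI w0, w1, #8, #4 -> BFMWri with ImmR = 24, ImmS = 3.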
- if (!NUsefulBits) - return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); + if (!NUsefulBits) { + CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); + return true; + } - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, - CurDAG)) - return nullptr; + if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG)) + return true; - SDLoc dl(N); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, dl, VT), - CurDAG->getTargetConstant(MSB, dl, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return tryBitfieldInsertOpFromOrAndImm(N, CurDAG); } /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the /// equivalent of a left shift by a constant amount followed by an and masking /// out a contiguous set of bits. -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) { if (N->getOpcode() != ISD::AND) - return nullptr; + return false; EVT VT = N->getValueType(0); - unsigned Opc; - if (VT == MVT::i32) - Opc = AArch64::UBFMWri; - else if (VT == MVT::i64) - Opc = AArch64::UBFMXri; - else - return nullptr; + if (VT != MVT::i32 && VT != MVT::i64) + return false; SDValue Op0; int DstLSB, Width; if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, Op0, DstLSB, Width)) - return nullptr; + return false; // ImmR is the rotate right amount. unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); @@ -2132,7 +2338,9 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { SDLoc DL(N); SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), CurDAG->getTargetConstant(ImmS, DL, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } bool @@ -2214,62 +2422,68 @@ static int getIntOperandFromRegisterString(StringRef RegString) { // register string argument is either of the form detailed in the ACLE (the // form described in getIntOperandFromRegisterString) or is a named register // known by the MRS SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } // Use the sysreg mapper to map the remaining possible strings to the // value for the register to be used for the instruction operand.
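Both tryReadRegister above and tryWriteRegister below accept either a named system register or the generic s<op0>_<op1>_c<n>_c<m>_<op2> spelling; either way the operand ends up as the packed MRS/MSR encoding. A sketch of the packing, assuming LLVM's 16-bit op0/op1/CRn/CRm/op2 layout:

    unsigned sysRegEncoding(unsigned Op0, unsigned Op1, unsigned CRn,
                            unsigned CRm, unsigned Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }
    // TPIDR_EL0 == s3_3_c13_c0_2 -> 0xDE82.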
- AArch64SysReg::MRSMapper mapper; - bool IsValidSpecialReg; - Reg = mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Readable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } - return nullptr; + return false; } // Lower the write_register intrinsic to an MSR instruction node if the special // register string argument is either of the form detailed in the ACLE (the // form described in getIntOperandFromRegisterString) or is a named register // known by the MSR SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, + if (Reg != -1) { + ReplaceNode( + N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + N->getOperand(2), N->getOperand(0))); + return true; + } // Check if the register was one of those allowed as the pstatefield value in // the MSR (immediate) instruction. To accept the values allowed in the // pstatefield for the MSR (immediate) instruction, we also require that an // immediate value has been provided as an argument; we know that this is // the case as it has been ensured by semantic checking. - AArch64PState::PStateMapper PMapper; - bool IsValidSpecialReg; - Reg = PMapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) { + auto PMapper = AArch64PState::lookupPStateByName(RegString->getString()); + if (PMapper) { assert(isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); + unsigned Reg = PMapper->Encoding; uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned State; if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { @@ -2279,29 +2493,66 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert(Immed < 16 && "Bad imm"); State = AArch64::MSRpstateImm4; } - return CurDAG->getMachineNode(State, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - CurDAG->getTargetConstant(Immed, DL, MVT::i16), - N->getOperand(0)); + ReplaceNode(N, CurDAG->getMachineNode( + State, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + CurDAG->getTargetConstant(Immed, DL, MVT::i16), + N->getOperand(0))); + return true; } // Use the sysreg mapper to attempt to map the remaining possible strings // to the value for the register to be used for the MSR (register) // instruction operand.
- AArch64SysReg::MSRMapper Mapper; - Reg = Mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Writeable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MSR, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(2), N->getOperand(0))); + return true; + } - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + return false; +} + +/// We've got special pseudo-instructions for these. +void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { + unsigned Opcode; + EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + if (MemTy == MVT::i8) + Opcode = AArch64::CMP_SWAP_8; + else if (MemTy == MVT::i16) + Opcode = AArch64::CMP_SWAP_16; + else if (MemTy == MVT::i32) + Opcode = AArch64::CMP_SWAP_32; + else if (MemTy == MVT::i64) + Opcode = AArch64::CMP_SWAP_64; + else + llvm_unreachable("Unknown AtomicCmpSwap type"); - return nullptr; + MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32; + SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(0)}; + SDNode *CmpSwap = CurDAG->getMachineNode( + Opcode, SDLoc(N), + CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { +void AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(errs() << "Selecting: "); DEBUG(Node->dump(CurDAG)); @@ -2311,54 +2562,61 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return nullptr; + return; } // A few custom selection cases. - SDNode *ResNode = nullptr; EVT VT = Node->getValueType(0); switch (Node->getOpcode()) { default: break; + case ISD::ATOMIC_CMP_SWAP: + SelectCMP_SWAP(Node); + return; + case ISD::READ_REGISTER: - if (SDNode *Res = SelectReadRegister(Node)) - return Res; + if (tryReadRegister(Node)) + return; break; case ISD::WRITE_REGISTER: - if (SDNode *Res = SelectWriteRegister(Node)) - return Res; + if (tryWriteRegister(Node)) + return; break; case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; + if (tryMLAV64LaneV128(Node)) + return; break; case ISD::LOAD: { // Try to select as an indexed load. Fall through to normal processing // if we can't.
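SelectCMP_SWAP above only picks a width-specific CMP_SWAP_* pseudo; later expansion turns it into a load-exclusive/store-exclusive loop. Its observable behavior matches a strong compare-exchange, sketched here with the standard atomics API:

    #include <atomic>
    #include <cstdint>

    // An LL/SC expansion stores Desired only if memory still holds Expected;
    // the returned flag and updated Expected mirror the pseudo's old-value
    // result.
    bool cmpSwap32(std::atomic<uint32_t> &Mem, uint32_t &Expected,
                   uint32_t Desired) {
      return Mem.compare_exchange_strong(Expected, Desired);
    }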
- bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; + if (tryIndexedLoad(Node)) + return; break; } case ISD::SRL: case ISD::AND: case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) - return I; + case ISD::SIGN_EXTEND_INREG: + if (tryBitfieldExtractOp(Node)) + return; + if (tryBitfieldInsertInZeroOp(Node)) + return; + break; + + case ISD::SIGN_EXTEND: + if (tryBitfieldExtractOpFromSExt(Node)) + return; break; case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; + if (tryBitfieldInsertOp(Node)) + return; break; case ISD::EXTRACT_VECTOR_ELT: { @@ -2401,19 +2659,25 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); DEBUG(Extract->dumpr(CurDAG)); DEBUG(dbgs() << "\n"); - return Extract.getNode(); + ReplaceNode(Node, Extract.getNode()); + return; } case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. ConstantSDNode *ConstNode = cast(Node); if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::XZR, MVT::i64).getNode(); + if (VT == MVT::i32) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else if (VT == MVT::i64) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64); + ReplaceNode(Node, New.getNode()); + return; + } } break; } @@ -2428,7 +2692,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { SDLoc DL(Node); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); @@ -2450,7 +2715,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(Node)->getMemOperand(); cast(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; + ReplaceNode(Node, Ld); + return; } case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: { @@ -2471,208 +2737,305 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MemOp[0] = cast(Node)->getMemOperand(); cast(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(Node, St); + return; } case Intrinsic::aarch64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, 
AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); - else 
if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 
3, AArch64::LD3Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); - else 
if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); - else if (VT == 
MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, AArch64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 2, AArch64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, AArch64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, AArch64::LD2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 2, AArch64::LD2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 2, AArch64::LD2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 2, AArch64::LD2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 2, AArch64::LD2i64); + return; + } break; case Intrinsic::aarch64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, AArch64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 3, AArch64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, AArch64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, AArch64::LD3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 3, AArch64::LD3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + 
SelectLoadLane(Node, 3, AArch64::LD3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 3, AArch64::LD3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 3, AArch64::LD3i64); + return; + } break; case Intrinsic::aarch64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, AArch64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 4, AArch64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, AArch64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, AArch64::LD4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 4, AArch64::LD4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 4, AArch64::LD4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 4, AArch64::LD4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 4, AArch64::LD4i64); + return; + } break; } } break; @@ -2682,33 +3045,39 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two - : AArch64::TBLv16i8Two, - false); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, + false); + return; case Intrinsic::aarch64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three - : AArch64::TBLv16i8Three, - false); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + return; case Intrinsic::aarch64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four - : AArch64::TBLv16i8Four, - false); + SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + return; case Intrinsic::aarch64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two - : AArch64::TBXv16i8Two, - true); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two, + true); + return; case Intrinsic::aarch64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three - : AArch64::TBXv16i8Three, - true); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + return; case Intrinsic::aarch64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four - : AArch64::TBXv16i8Four, - true); + SelectTable(Node, 4, VT == MVT::v8i8 ? 
AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + return; case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; + if (tryMULLV64LaneV128(IntNo, Node)) + return; break; } break; @@ -2721,588 +3090,827 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST1Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST1Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST1Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST1Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST1Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST1Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST1Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST1Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST1Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST1Threev16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST1Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST1Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST1Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST1Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST1Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST1Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST1Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST1Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) 
{ + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST1Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST1Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST1Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST1Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST1Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST1Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST1Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST1Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST1Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST2Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST2Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST2Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST2Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST2Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST2Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST2Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST2Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST2Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST3Threev16b); - else if (VT == MVT::v4i16 || VT == 
MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST3Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST3Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST3Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST3Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST3Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST3Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST3Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST3Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST4Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST4Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST4Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST4Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST4Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST4Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST4Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST4Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST4Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, AArch64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 2, AArch64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, AArch64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - 
return SelectStoreLane(Node, 2, AArch64::ST2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 2, AArch64::ST2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 2, AArch64::ST2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 2, AArch64::ST2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 2, AArch64::ST2i64); + return; + } break; } case Intrinsic::aarch64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, AArch64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 3, AArch64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, AArch64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, AArch64::ST3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 3, AArch64::ST3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 3, AArch64::ST3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 3, AArch64::ST3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 3, AArch64::ST3i64); + return; + } break; } case Intrinsic::aarch64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, AArch64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 4, AArch64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, AArch64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, AArch64::ST4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 4, AArch64::ST4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 4, AArch64::ST4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 4, AArch64::ST4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 4, AArch64::ST4i64); + return; + } break; } } break; } case AArch64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 
2, AArch64::LD2Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, 
AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == 
MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, 
AArch64::LD1Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 
|| VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 
|| VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + return; + } break; } case AArch64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == 
MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::LD3LANEpost: {
-    if (VT == MVT::v16i8 || VT == MVT::v8i8)
-      return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
-             VT == MVT::v8f16)
-      return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-             VT == MVT::v2f32)
-      return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-             VT == MVT::v1f64)
-      return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::LD4LANEpost: {
-    if (VT == MVT::v16i8 || VT == MVT::v8i8)
-      return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
-             VT == MVT::v8f16)
-      return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-             VT == MVT::v2f32)
-      return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-             VT == MVT::v1f64)
-      return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST2post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST3post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST4post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST1x2post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST1x3post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST1x4post: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v8i8)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
-    else if (VT == MVT::v16i8)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
-    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
-    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
-    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-      return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST2LANEpost: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v16i8 || VT == MVT::v8i8)
-      return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
-             VT == MVT::v8f16)
-      return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-             VT == MVT::v2f32)
-      return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-             VT == MVT::v1f64)
-      return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST3LANEpost: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v16i8 || VT == MVT::v8i8)
-      return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
-             VT == MVT::v8f16)
-      return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-             VT == MVT::v2f32)
-      return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-             VT == MVT::v1f64)
-      return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+      return;
+    }
     break;
   }
   case AArch64ISD::ST4LANEpost: {
     VT = Node->getOperand(1).getValueType();
-    if (VT == MVT::v16i8 || VT == MVT::v8i8)
-      return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
-             VT == MVT::v8f16)
-      return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
-    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-             VT == MVT::v2f32)
-      return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
-    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-             VT == MVT::v1f64)
-      return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+      return;
+    }
     break;
   }
   }

   // Select the default instruction
-  ResNode = SelectCode(Node);
-
-  DEBUG(errs() << "=> ");
-  if (ResNode == nullptr || ResNode == Node)
-    DEBUG(Node->dump(CurDAG));
-  else
-    DEBUG(ResNode->dump(CurDAG));
-  DEBUG(errs() << "\n");
-
-  return ResNode;
+  SelectCode(Node);
 }

 /// createAArch64ISelDag - This pass converts a legalized DAG into a
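The mechanical rewrite above tracks the LLVM 3.9 change of SelectionDAGISel's Select() from returning an SDNode* to returning void: each case now performs the replacement itself and returns, instead of handing a node back to the caller. A minimal sketch of the new shape (the wrapper function here is hypothetical; SelectPostStore and SelectCode are the real helpers used in the hunks above):

  // Sketch only: shows the void-returning Select() pattern adopted above,
  // not code from this patch.
  void AArch64DAGToDAGISel::selectOneCase(SDNode *Node, EVT VT) { // hypothetical
    if (VT == MVT::v8i8) {
      SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); // replaces Node in place
      return;                                            // nothing to hand back
    }
    SelectCode(Node); // otherwise defer to the TableGen-generated matcher
  }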
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 92cf1cd71970..d6f2a190d4c8 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,12 +40,6 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");

-// Place holder until extr generation is tested fully.
-static cl::opt<bool>
-EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
-                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
-                            cl::init(true));
-
 static cl::opt<bool>
 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                            cl::desc("Allow AArch64 SLI/SRI formation"),
@@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
     cl::init(false));

+// Disabled for causing self-hosting failures once returned-attribute inference
+// was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden,
+                        cl::desc("Directly forward this return"),
+                        cl::init(false));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;

@@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

-  // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
-  // counterparts, which AArch64 supports directly.
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
   setOperationAction(ISD::CTPOP, MVT::i64, Custom);

@@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
@@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   // Also, try to fold ADD into CSINC/CSINV..
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
-
+  setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::XOR);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
@@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   MaskAndBranchFoldingIsLegal = true;
   EnableExtLdPromotion = true;

+  // Set required alignment.
   setMinFunctionAlignment(2);
+  // Set preferred alignments.
+  setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+  setPrefLoopAlignment(STI.getPrefLoopAlignment());

   setHasExtractBitsInsn(true);

@@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

+    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+    setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
+    setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
+
     // AArch64 doesn't have MUL.2d:
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
     // Custom handling for some quad-vector types to detect MULL.
@@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }

-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }

-void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   if (VT == MVT::v2f32 || VT == MVT::v4f16) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);

-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
   } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);

-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
   }

   // Mark vector float intrinsics as expand.
   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
-    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FPOWI, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::FLOG, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FLOG10, VT, Expand);
+    setOperationAction(ISD::FEXP, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);

     // But we do support custom-lowering for FCOPYSIGN.
-    setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
-  }
-
-  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
-  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+  }
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+  setOperationAction(ISD::SRA, VT, Custom);
+  setOperationAction(ISD::SRL, VT, Custom);
+  setOperationAction(ISD::SHL, VT, Custom);
+  setOperationAction(ISD::AND, VT, Custom);
+  setOperationAction(ISD::OR, VT, Custom);
+  setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+  setOperationAction(ISD::SELECT, VT, Expand);
+  setOperationAction(ISD::SELECT_CC, VT, Expand);
+  setOperationAction(ISD::VSELECT, VT, Expand);

   for (MVT InnerVT : MVT::all_valuetypes())
-    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
+    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

   // CNT supports only B element sizes.
   if (VT != MVT::v8i8 && VT != MVT::v16i8)
-    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);

-  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);

-  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+  setOperationAction(ISD::FP_TO_UINT, VT, Custom);

   // [SU][MIN|MAX] are available for all NEON types apart from i64.
-  if (!VT.isFloatingPoint() &&
-      VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
-      setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+      setOperationAction(Opcode, VT, Legal);

   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
   if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
     for (unsigned Opcode :
          {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM})
-      setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+      setOperationAction(Opcode, VT, Legal);

   if (Subtarget->isLittleEndian()) {
     for (unsigned im = (unsigned)ISD::PRE_INC;
          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
-      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
-      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+      setIndexedLoadAction(im, VT, Legal);
+      setIndexedStoreAction(im, VT, Legal);
     }
   }
 }
@@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (Subtarget->requiresStrictAlign())
     return false;

-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.
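The hunk above swaps a hard-coded isCyclone() test for a subtarget predicate, so "slow misaligned 128-bit stores" becomes a per-CPU feature rather than a CPU-name check. A sketch of that pattern with illustrative names (the real wiring goes through TableGen subtarget features, not shown here):

  struct SubtargetSketch {                // illustrative, not AArch64Subtarget
    bool Misaligned128StoreSlow = false;  // would be set from a CPU feature bit
    bool isMisaligned128StoreSlow() const { return Misaligned128StoreSlow; }
  };

  // Mirrors the *Fast computation above: unaligned accesses count as fast
  // unless this CPU is slow specifically on 16-byte stores.
  static bool unalignedIsFast(const SubtargetSketch &ST, unsigned StoreBytes) {
    return !ST.isMisaligned128StoreSlow() || StoreBytes != 16;
  }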
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
   case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
   case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
+  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
   }
   return nullptr;
 }

 MachineBasicBlock *
-AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
   // We materialise the F128CSEL pseudo-instruction as some control flow and a
   // phi node:
@@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   MachineFunction *MF = MBB->getParent();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  DebugLoc DL = MI->getDebugLoc();
+  DebugLoc DL = MI.getDebugLoc();
   MachineFunction::iterator It = ++MBB->getIterator();

-  unsigned DestReg = MI->getOperand(0).getReg();
-  unsigned IfTrueReg = MI->getOperand(1).getReg();
-  unsigned IfFalseReg = MI->getOperand(2).getReg();
-  unsigned CondCode = MI->getOperand(3).getImm();
-  bool NZCVKilled = MI->getOperand(4).isKill();
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned IfTrueReg = MI.getOperand(1).getReg();
+  unsigned IfFalseReg = MI.getOperand(2).getReg();
+  unsigned CondCode = MI.getOperand(3).getImm();
+  bool NZCVKilled = MI.getOperand(4).isKill();

   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
       .addReg(IfFalseReg)
       .addMBB(MBB);

-  MI->eraseFromParent();
+  MI.eraseFromParent();
   return EndBB;
 }

-MachineBasicBlock *
-AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                   MachineBasicBlock *BB) const {
-  switch (MI->getOpcode()) {
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
   default:
 #ifndef NDEBUG
-    MI->dump();
+    MI.dump();
 #endif
     llvm_unreachable("Unexpected instruction for custom inserter!");

@@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC,
   }
 }

+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+                                     AArch64CC::CondCode &CondCode,
+                                     AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+    assert(CondCode2 == AArch64CC::AL);
+    break;
+  case ISD::SETONE:
+    // (a one b)
+    // == ((a olt b) || (a ogt b))
+    // == ((a ord b) && (a une b))
+    CondCode = AArch64CC::VC;
+    CondCode2 = AArch64CC::NE;
+    break;
+  case ISD::SETUEQ:
+    // (a ueq b)
+    // == ((a uno b) || (a oeq b))
+    // == ((a ule b) && (a uge b))
+    CondCode = AArch64CC::PL;
+    CondCode2 = AArch64CC::LE;
+    break;
+  }
+}
+
 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
 /// CC usable with the vector instructions. Fewer operations are available
 /// without a real NZCV register, so we have to use less efficient combinations
@@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) {
 }

 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                              SDLoc dl, SelectionDAG &DAG) {
+                              const SDLoc &dl, SelectionDAG &DAG) {
   EVT VT = LHS.getValueType();

-  if (VT.isFloatingPoint())
+  if (VT.isFloatingPoint()) {
+    assert(VT != MVT::f128);
+    if (VT == MVT::f16) {
+      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+      VT = MVT::f32;
+    }
     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+  }

   // The CMP instruction is just an alias for SUBS, and representing it as
   // SUBS means that it's possible to get CSE with subtract operations.
@@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                          ISD::CondCode CC, SDValue CCOp,
-                                         SDValue Condition, unsigned NZCV,
-                                         SDLoc DL, SelectionDAG &DAG) {
+                                         AArch64CC::CondCode Predicate,
+                                         AArch64CC::CondCode OutCC,
+                                         const SDLoc &DL, SelectionDAG &DAG) {
   unsigned Opcode = 0;
-  if (LHS.getValueType().isFloatingPoint())
+  if (LHS.getValueType().isFloatingPoint()) {
+    assert(LHS.getValueType() != MVT::f128);
+    if (LHS.getValueType() == MVT::f16) {
+      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+    }
     Opcode = AArch64ISD::FCCMP;
-  else if (RHS.getOpcode() == ISD::SUB) {
+  } else if (RHS.getOpcode() == ISD::SUB) {
     SDValue SubOp0 = RHS.getOperand(0);
     if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-      // See emitComparison() on why we can only do this for SETEQ and SETNE.
-      Opcode = AArch64ISD::CCMN;
-      RHS = RHS.getOperand(1);
-    }
+      // See emitComparison() on why we can only do this for SETEQ and SETNE.
+      Opcode = AArch64ISD::CCMN;
+      RHS = RHS.getOperand(1);
+    }
   }
   if (Opcode == 0)
     Opcode = AArch64ISD::CCMP;

+  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
 }
@@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
 /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
 /// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
                                          unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
   unsigned Opcode = Val->getOpcode();
   if (Opcode == ISD::SETCC) {
-    CanPushNegate = true;
+    if (Val->getOperand(0).getValueType() == MVT::f128)
+      return false;
+    CanNegate = true;
     return true;
   }
-  // Protect against stack overflow.
-  if (Depth > 15)
+  // Protect against exponential runtime and stack overflow.
+  if (Depth > 6)
     return false;
   if (Opcode == ISD::AND || Opcode == ISD::OR) {
     SDValue O0 = Val->getOperand(0);
     SDValue O1 = Val->getOperand(1);
-    bool CanPushNegateL;
-    if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+    bool CanNegateL;
+    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
       return false;
-    bool CanPushNegateR;
-    if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+    bool CanNegateR;
+    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
       return false;
-    // We cannot push a negate through an AND operation (it would become an OR),
-    // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
-    // push the negate through the x/y subtrees.
-    CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+
+    if (Opcode == ISD::OR) {
+      // For an OR expression we need to be able to negate at least one side or
+      // we cannot do the transformation at all.
+      if (!CanNegateL && !CanNegateR)
+        return false;
+      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+      // can negate the x and y subtrees.
+      CanNegate = CanNegateL && CanNegateR;
+    } else {
+      // If the operands are OR expressions then we finally need to negate their
+      // outputs, we can only do that for the operand emitted last by
+      // negating OutCC, not for both operands.
+      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+      if (NeedsNegOutL && NeedsNegOutR)
+        return false;
+      // We cannot negate an AND operation (it would become an OR).
+      CanNegate = false;
+    }
     return true;
   }
   return false;
@@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
 /// for the comparisons in the current subtree; @p Depth limits the search
 /// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
-    AArch64CC::CondCode &OutCC, bool PushNegate = false,
-    SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
-    unsigned Depth = 0) {
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+    AArch64CC::CondCode Predicate) {
   // We're at a tree leaf, produce a conditional comparison operation.
   unsigned Opcode = Val->getOpcode();
   if (Opcode == ISD::SETCC) {
@@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
     SDValue RHS = Val->getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
     bool isInteger = LHS.getValueType().isInteger();
-    if (PushNegate)
+    if (Negate)
       CC = getSetCCInverse(CC, isInteger);
     SDLoc DL(Val);
     // Determine OutCC and handle FP special case.
@@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
     } else {
       assert(LHS.getValueType().isFloatingPoint());
       AArch64CC::CondCode ExtraCC;
-      changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
-      // Surpisingly some floating point conditions can't be tested with a
-      // single condition code. Construct an additional comparison in this case.
-      // See comment below on how we deal with OR conditions.
+      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+      // Some floating point conditions can't be tested with a single condition
+      // code. Construct an additional comparison in this case.
       if (ExtraCC != AArch64CC::AL) {
         SDValue ExtraCmp;
         if (!CCOp.getNode())
           ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
-        else {
-          SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
-          // Note that we want the inverse of ExtraCC, so NZCV is not inversed.
-          unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
-          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
-                                               NZCV, DL, DAG);
-        }
+        else
+          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+                                               ExtraCC, DL, DAG);
         CCOp = ExtraCmp;
-        Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
-        OutCC = AArch64CC::getInvertedCondCode(OutCC);
+        Predicate = ExtraCC;
       }
     }

     // Produce a normal comparison if we are first in the chain
-    if (!CCOp.getNode())
+    if (!CCOp)
       return emitComparison(LHS, RHS, CC, DL, DAG);
     // Otherwise produce a ccmp.
-    SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
-    AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
-    unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
-    return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                      DAG);
-  } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
-    return SDValue();
-
-  assert((Opcode == ISD::OR || !PushNegate)
-         && "Can only push negate through OR operation");
+  }
+  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+         "Valid conjunction/disjunction tree");

   // Check if both sides can be transformed.
   SDValue LHS = Val->getOperand(0);
   SDValue RHS = Val->getOperand(1);
-  bool CanPushNegateL;
-  if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
-    return SDValue();
-  bool CanPushNegateR;
-  if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
-    return SDValue();

-  // Do we need to negate our operands?
-  bool NegateOperands = Opcode == ISD::OR;
+  // In case of an OR we need to negate our operands and the result.
+  // (A v B) <=> not(not(A) ^ not(B))
+  bool NegateOpsAndResult = Opcode == ISD::OR;

   // We can negate the results of all previous operations by inverting the
-  // predicate flags giving us a free negation for one side. For the other side
-  // we need to be able to push the negation to the leafs of the tree.
-  if (NegateOperands) {
-    if (!CanPushNegateL && !CanPushNegateR)
-      return SDValue();
-    // Order the side where we can push the negate through to LHS.
-    if (!CanPushNegateL && CanPushNegateR)
+  // predicate flags giving us a free negation for one side. The other side
+  // must be negatable by itself.
+  if (NegateOpsAndResult) {
+    // See which side we can negate.
+    bool CanNegateL;
+    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+    assert(isValidL && "Valid conjunction/disjunction tree");
+    (void)isValidL;
+
+#ifndef NDEBUG
+    bool CanNegateR;
+    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+    assert(isValidR && "Valid conjunction/disjunction tree");
+    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+    // Order the side which we cannot negate to RHS so we can emit it first.
+    if (!CanNegateL)
       std::swap(LHS, RHS);
   } else {
     bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
-    bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
-    if (NeedsNegOutL && NeedsNegOutR)
-      return SDValue();
+    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+           "Valid conjunction/disjunction tree");
     // Order the side where we need to negate the output flags to RHS so it
     // gets emitted first.
     if (NeedsNegOutL)
@@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
   // through if we are already in a PushNegate case, otherwise we can negate
   // the "flags to test" afterwards.
   AArch64CC::CondCode RHSCC;
-  SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
-                                                CCOp, Predicate, Depth+1);
-  if (NegateOperands && !PushNegate)
+  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+                                                   CCOp, Predicate);
+  if (NegateOpsAndResult && !Negate)
     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
-  // Emit LHS. We must push the negate through if we need to negate it.
-  SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
-                                                CmpR, RHSCC, Depth+1);
+  // Emit LHS. We may need to negate it.
+  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+                                                   NegateOpsAndResult, CmpR,
+                                                   RHSCC);
   // If we transformed an OR to an AND then we have to negate the result
-  // (or absorb a PushNegate resulting in a double negation).
-  if (Opcode == ISD::OR && !PushNegate)
+  // (or absorb the Negate parameter).
+  if (NegateOpsAndResult && !Negate)
     OutCC = AArch64CC::getInvertedCondCode(OutCC);
   return CmpL;
 }

+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+                                              AArch64CC::CondCode &OutCC) {
+  bool CanNegate;
+  if (!isConjunctionDisjunctionTree(Val, CanNegate))
+    return SDValue();
+
+  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
+                                           AArch64CC::AL);
+}
+
 /// @}

 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+                             SDValue &AArch64cc, SelectionDAG &DAG,
+                             const SDLoc &dl) {
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     EVT VT = RHS.getValueType();
     uint64_t C = RHSC->getZExtValue();
@@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
   StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
+    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
@@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
     // The values are implicitly truncated so sext vs. zext doesn't matter.
     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, dl,
-                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
 }

 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
@@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   SDLoc dl(Op);
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
-  case Intrinsic::aarch64_thread_pointer: {
+  case Intrinsic::thread_pointer: {
     EVT PtrVT = getPointerTy(DAG.getDataLayout());
     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
   }
@@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
     return CC_AArch64_GHC;
   case CallingConv::C:
   case CallingConv::Fast:
+  case CallingConv::PreserveMost:
+  case CallingConv::CXX_FAST_TLS:
     if (!Subtarget->isTargetDarwin())
       return CC_AArch64_AAPCS;
     return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
@@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,

 SDValue AArch64TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals) const {
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();

@@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       ArgValue = DAG.getExtLoad(
           ExtType, DL, VA.getLocVT(), Chain, FIN,
           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
-          MemVT, false, false, false, 0);
+          MemVT);

       InVals.push_back(ArgValue);
     }
   }

   // varargs
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   if (isVarArg) {
     if (!Subtarget->isTargetDarwin()) {
       // The AAPCS variadic function ABI is identical to the non-variadic
       // one. As a result there may be more arguments in registers and we should
       // save them for future reference.
       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
     }

-    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
     // This will point to the next argument passed via stack.
     unsigned StackOffset = CCInfo.getNextStackOffset();
     // We currently pass all varargs at 8-byte alignment.
     StackOffset = ((StackOffset + 7) & ~7);
-    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+    FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
   }

-  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   unsigned StackArgSize = CCInfo.getNextStackOffset();
   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
     // This is a non-standard ABI so by fiat I say we're allowed to make full
     // use of the stack area to be popped, which must be aligned to 16 bytes in
     // any case:
-    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+    StackArgSize = alignTo(StackArgSize, 16);

     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
     // a multiple of 16.
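The conjunction/disjunction machinery reworked above exists so that chained comparisons lower to one CMP followed by CCMP/FCCMP rather than a branch per clause. A hand-written illustration of the intended effect (a sketch, not output of this patch):

  int both(int a, int b) { return a == 0 && b == 5; }
  /* Plausible AArch64 lowering:
       cmp  w0, #0          // flags from a == 0
       ccmp w1, #5, #0, eq  // if eq: flags from b == 5; else NZCV := 0000,
                            // which fails the final eq test
       cset w0, eq          // materialize the whole conjunction
     An || chain is handled via De Morgan, (A || B) == !(!A && !B), which is
     why isConjunctionDisjunctionTree() tracks whether each side of the tree
     can be negated. */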
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
 }

 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
-                                                SelectionDAG &DAG, SDLoc DL,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &DL,
                                                 SDValue &Chain) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
       SDValue Store = DAG.getStore(
           Val.getValue(1), DL, Val, FIN,
-          MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
-          false, 0);
+          MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
       MemOps.push_back(Store);
       FIN =
           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,

       SDValue Store = DAG.getStore(
           Val.getValue(1), DL, Val, FIN,
-          MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
-          false, false, 0);
+          MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
       MemOps.push_back(Store);
       FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                         DAG.getConstant(16, DL, PtrVT));
@@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
 /// appropriate copies out of appropriate physical registers.
 SDValue AArch64TargetLowering::LowerCallResult(
     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
     SDValue ThisVal) const {
   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
@@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult(

     // Pass 'this' value directly from the argument to return value, to avoid
     // reg unit interference
-    if (i == 0 && isThisReturn) {
+    if (i == 0 && isThisReturn && EnableThisRetForwarding) {
       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
              "unexpected return calling convention register assignment");
       InVals.push_back(ThisVal);
@@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult(

 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-    bool isCalleeStructRet, bool isCallerStructRet,
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals,
     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
@@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
     return false;

-  const MachineFunction &MF = DAG.getMachineFunction();
+  MachineFunction &MF = DAG.getMachineFunction();
   const Function *CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF->getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
@@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     return false;

   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
-    if (IsTailCallConvention(CalleeCC) && CCMatch)
-      return true;
-    return false;
+    return IsTailCallConvention(CalleeCC) && CCMatch;
   }

   // Externally-defined functions with weak linkage should not be
@@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   assert((!isVarArg || CalleeCC == CallingConv::C) &&
          "Unexpected variadic calling convention");

+  LLVMContext &C = *DAG.getContext();
   if (isVarArg && !Outs.empty()) {
     // At least two cases here: if caller is fastcc then we can't have any
     // memory arguments (we'd be expected to clean up the stack afterwards). If
@@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     // FIXME: for now we take the most conservative of these in both cases:
     // disallow all variadic memory operands.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                   *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));

     for (const CCValAssign &ArgLoc : ArgLocs)
@@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
       return false;
   }

-  // If the calling conventions do not match, then we'd better make sure the
-  // results are returned in the same way as what the caller expects.
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+                                  CCAssignFnForCall(CalleeCC, isVarArg),
+                                  CCAssignFnForCall(CallerCC, isVarArg)))
+    return false;
+  // The callee has to preserve all registers the caller needs to preserve.
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   if (!CCMatch) {
-    SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
-                    *DAG.getContext());
-    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
-
-    SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
-                    *DAG.getContext());
-    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
-
-    if (RVLocs1.size() != RVLocs2.size())
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
       return false;
-    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
-      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
-        return false;
-      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
-        return false;
-      if (RVLocs1[i].isRegLoc()) {
-        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
-          return false;
-      } else {
-        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
-          return false;
-      }
-    }
   }

   // Nothing more to check if the callee is taking no arguments
@@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     return true;

   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

-  // If the stack arguments for this call would fit into our own save area then
-  // the call can be made tail.
-  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+  // If the stack arguments for this call do not fit into our own save area then
+  // the call cannot be made a tail call.
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+    return false;
+
+  return true;
 }

 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
@@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
 }

 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
-  return CallCC == CallingConv::Fast;
+  return CallCC == CallingConv::Fast ||
+         CallCC == CallingConv::PreserveMost;
 }

 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
@@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool IsVarArg = CLI.IsVarArg;

   MachineFunction &MF = DAG.getMachineFunction();
-  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool IsThisReturn = false;

   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (IsTailCall) {
     // Check if it's really possible to do a tail call.
     IsTailCall = isEligibleForTailCallOptimization(
-        Callee, CallConv, IsVarArg, IsStructRet,
-        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
     if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
       report_fatal_error("failed to perform tail call elimination on a call "
                          "site marked musttail");
@@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,

     // Since callee will pop argument stack as a tail call, we must keep the
     // popped size 16-byte aligned.
-    NumBytes = RoundUpToAlignment(NumBytes, 16);
+    NumBytes = alignTo(NumBytes, 16);

     // FPDiff will be negative if this tail call requires more space than we
     // would automatically have in our incoming argument space. Positive if we
@@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
           VA.getValVT() == MVT::i16)
         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

-      SDValue Store =
-          DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+      SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
       MemOpChains.push_back(Store);
     }
   }
@@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
   InFlag = Chain.getValue(1);

-  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
-                                ? RoundUpToAlignment(NumBytes, 16)
-                                : 0;
+  uint64_t CalleePopBytes =
+      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
@@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                    bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
-                                   SDLoc DL, SelectionDAG &DAG) const {
+                                   const SDLoc &DL, SelectionDAG &DAG) const {
   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
@@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
     return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
   }

-  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
-    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
-           "use of MO_CONSTPOOL only supported on small model");
-    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
-    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
-    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
-    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
-    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-    SDValue GlobalAddr = DAG.getLoad(
-        PtrVT, DL, DAG.getEntryNode(), PoolAddr,
-        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
-        /*isVolatile=*/false,
-        /*isNonTemporal=*/true,
-        /*isInvariant=*/true, 8);
-    if (GN->getOffset() != 0)
-      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
-                         DAG.getConstant(GN->getOffset(), DL, PtrVT));
-    return GlobalAddr;
-  }
-
   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
     const unsigned char MO_NC = AArch64II::MO_NC;
     return DAG.getNode(
@@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
   SDValue Chain = DAG.getEntryNode();
   SDValue FuncTLVGet =
       DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
-                  MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
-                  true, true, 8);
+                  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+                  /* Alignment = */ 8, MachineMemOperand::MONonTemporal |
+                                           MachineMemOperand::MOInvariant);
   Chain = FuncTLVGet.getValue(1);

   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
 /// above sequence, and expanded really late in the compilation flow, to ensure
 /// the sequence is produced as per above.
-SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+                                                      const SDLoc &DL,
                                                       SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Chain = DAG.getEntryNode();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

-  SmallVector<SDValue, 2> Ops;
-  Ops.push_back(Chain);
-  Ops.push_back(SymAddr);
-
-  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+  Chain =
+      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
   SDValue Glue = Chain.getValue(1);

   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
@@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                               SDValue RHS, SDValue TVal,
-                                              SDValue FVal, SDLoc dl,
+                                              SDValue FVal, const SDLoc &dl,
                                               SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
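The regmaskSubsetEqual() test above encodes a simple invariant: a tail call is only safe if the callee preserves every register the caller needs preserved. LLVM's masks are arrays of uint32_t; this one-word model just shows the set algebra:

  // Sketch: the caller's preserved set must be a subset of the callee's.
  static bool preservedSubset(uint64_t CallerPreserved, uint64_t CalleePreserved) {
    return (CallerPreserved & ~CalleePreserved) == 0; // no caller bit may be lost
  }
  // e.g. preservedSubset(0b0110, 0b1110) is true: the callee may preserve more
  // registers than the caller requires, never fewer.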
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                  getPointerTy(DAG.getDataLayout()));
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
-                      MachinePointerInfo(SV), false, false, 0);
+                      MachinePointerInfo(SV));
 }

 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
@@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
   // void *__stack at offset 0
   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
-                                MachinePointerInfo(SV), false, false, 8));
+                                MachinePointerInfo(SV), /* Alignment = */ 8));

   // void *__gr_top at offset 8
   int GPRSize = FuncInfo->getVarArgsGPRSize();
@@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                         DAG.getConstant(GPRSize, DL, PtrVT));

     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
-                                  MachinePointerInfo(SV, 8), false, false, 8));
+                                  MachinePointerInfo(SV, 8),
+                                  /* Alignment = */ 8));
   }

   // void *__vr_top at offset 16
@@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                         DAG.getConstant(FPRSize, DL, PtrVT));

     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
-                                  MachinePointerInfo(SV, 16), false, false, 8));
+                                  MachinePointerInfo(SV, 16),
+                                  /* Alignment = */ 8));
   }

   // int __gr_offs at offset 24
   SDValue GROffsAddr =
       DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
-  MemOps.push_back(DAG.getStore(Chain, DL,
-                                DAG.getConstant(-GPRSize, DL, MVT::i32),
-                                GROffsAddr, MachinePointerInfo(SV, 24), false,
-                                false, 4));
+  MemOps.push_back(DAG.getStore(
+      Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
+      MachinePointerInfo(SV, 24), /* Alignment = */ 4));

   // int __vr_offs at offset 28
   SDValue VROffsAddr =
       DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
-  MemOps.push_back(DAG.getStore(Chain, DL,
-                                DAG.getConstant(-FPRSize, DL, MVT::i32),
-                                VROffsAddr, MachinePointerInfo(SV, 28), false,
-                                false, 4));
+  MemOps.push_back(DAG.getStore(
+      Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
+      MachinePointerInfo(SV, 28), /* Alignment = */ 4));

   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
 }
@@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   unsigned Align = Op.getConstantOperandVal(3);
   auto PtrVT = getPointerTy(DAG.getDataLayout());

-  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
-                               false, false, false, 0);
+  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
   Chain = VAList.getValue(1);

   if (Align > 8) {
@@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                DAG.getConstant(ArgSize, DL, PtrVT));
   // Store the incremented VAList to the legalized pointer
-  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
-                                 false, false, 0);
+  SDValue APStore =
+      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

   // Load the actual argument out of the pointer VAList
   if (NeedFPTrunc) {
     // Load the value as an f64.
-    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
-                                 MachinePointerInfo(), false, false, false, 0);
+    SDValue WideFP =
+        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
     // Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); @@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; + RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; + RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. +const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. 
+ // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasFPARMv8()) + return "r"; + + if (ConstraintVT.isFloatingPoint()) + return "w"; + + if (ConstraintVT.isVector() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. @@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. + // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. - Res.first = AArch64::FPR128RegClass.getRegister(RegNo); - Res.second = &AArch64::FPR128RegClass; + if (VT != MVT::Other && VT.getSizeInBits() == 64) { + Res.first = AArch64::FPR64RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR64RegClass; + } else { + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } } } } @@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa(V.getOperand(1))) { // A shuffle can only come from building a vector from various - // elements of other vectors. + // elements of other vectors, provided their indices are constant. return SDValue(); } @@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); @@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); + // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], + // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), - V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, + V2Cst, DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
if (Lane == -1) @@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { + } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); @@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) + if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; @@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } @@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op, } Ops.push_back(Lane); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, @@ -6217,7 +6311,7 @@ FailedModImm: SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -6273,7 +6367,7 @@ FailedModImm: for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6328,7 +6422,7 @@ FailedModImm: // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = @@ -6339,7 +6433,7 @@ FailedModImm: } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); @@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); - if (isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) - return false; - - return true; + return !(isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; + // Avoid UB for INT64_MIN. + if (Immed == std::numeric_limits::min()) + return false; + // Same encoding for add/sub, just flip the sign. + Immed = std::abs(Immed); + return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; return isLegalAddImmediate(Immed); } @@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; + return !AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return Shift < 3; } +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// cmge X, X, #0 +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget->hasNEON() || !VT.isVector()) + return SDValue(); + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. 
+  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+    return SDValue();
+
+  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+}
+
 // Generate SUBS and CSEL for integer abs.
 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
@@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-// performXorCombine - Attempts to handle integer ABS.
 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+    return Cmp;
+
   return performIntegerAbsCombine(N, DAG);
 }
 
@@ -7376,6 +7496,10 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                              SelectionDAG &DAG,
                                              std::vector<SDNode *> *Created) const {
+  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  if (isIntDivCheap(N->getValueType(0), Attr))
+    return SDValue(N,0); // Lower SDIV as SDIV
+
   // fold (sdiv X, pow2)
   EVT VT = N->getValueType(0);
   if ((VT != MVT::i32 && VT != MVT::i64) ||
@@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
   // 64-bit is 5 cycles, so this is always a win.
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
-    APInt Value = C->getAPIntValue();
+    const APInt &Value = C->getAPIntValue();
     EVT VT = N->getValueType(0);
     SDLoc DL(N);
     if (Value.isNonNegative()) {
@@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
       !cast<LoadSDNode>(N0)->isVolatile()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                               LN0->getPointerInfo(), LN0->isVolatile(),
-                               LN0->isNonTemporal(), LN0->isInvariant(),
-                               LN0->getAlignment());
+                               LN0->getPointerInfo(), LN0->getAlignment(),
+                               LN0->getMemOperand()->getFlags());
 
   // Make sure successors of the original load stay after it by updating them
   // to use the new Chain.
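The new foldVectorXorShiftIntoCmp above rewrites xor(sra(X, w-1), -1) into a single compare-greater-or-equal-to-zero, relying on the per-lane identity that NOT of a smeared sign bit is all-ones exactly when the lane is non-negative. A stand-alone check of that identity, assuming arithmetic right shift on signed integers (guaranteed only from C++20, but universal on mainstream compilers):

// Checks: ~(x >> 31) == (x >= 0 ? all-ones : 0) for 32-bit lanes,
// which is what `cmge v, v, #0` produces lane-wise on AArch64.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t samples[] = {0, 1, -1, 42, -42, INT32_MAX, INT32_MIN};
  for (int32_t x : samples) {
    // xor (sra X, 31), -1 -- the pattern being replaced
    uint32_t folded = ~static_cast<uint32_t>(x >> 31);
    // cmge-against-zero lane semantics
    uint32_t cmgez = (x >= 0) ? 0xFFFFFFFFu : 0u;
    assert(folded == cmgez);
  }
  return 0;
}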
@@ -7567,7 +7690,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -7801,25 +7925,49 @@ static SDValue tryCombineToBSL(SDNode *N, static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableAArch64ExtrGeneration) - return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } +static SDValue performSRLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the + // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) + // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::BSWAP) { + SDLoc DL(N); + SDValue N1 = N->getOperand(1); + SDValue N00 = N0.getOperand(0); + if (ConstantSDNode *C = dyn_cast(N1)) { + uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::i32 && ShiftAmt == 16 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + if (VT == MVT::i64 && ShiftAmt == 32 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + } + } + return SDValue(); +} + static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -8575,15 +8723,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + St->getAlignment(), St->getMemOperand()->getFlags()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); + St->getPointerInfo(), Alignment, + St->getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; @@ -8603,9 +8751,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundaries. We want to split such stores. - if (!Subtarget->isCyclone()) + if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. 
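The new performSRLCombine above canonicalizes (srl (bswap x), 16) to (rotr (bswap x), 16) only when the high 16 bits of x are known zero: under that precondition the low 16 bits of bswap(x) are zero, so the rotate brings in nothing but zeros and the two forms agree, while the rotr form matches the REV16-style selection patterns. A small stand-alone check (bswap32/rotr32 are hand-written helpers, not LLVM's):

// Checks: if (x >> 16) == 0 then bswap32(x) >> 16 == rotr(bswap32(x), 16).
#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0xFF00u) | ((v << 8) & 0xFF0000u) | (v << 24);
}
static uint32_t rotr32(uint32_t v, unsigned n) {
  return (v >> n) | (v << (32 - n)); // valid for 0 < n < 32
}

int main() {
  const uint32_t samples[] = {0x0000ABCDu, 0x00001234u, 0x0000FFFFu, 0u};
  for (uint32_t x : samples) {
    assert((x >> 16) == 0); // the precondition MaskedValueIsZero establishes
    assert((bswap32(x) >> 16) == rotr32(bswap32(x), 16));
  }
  return 0;
}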
@@ -8647,12 +8793,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   SDValue BasePtr = S->getBasePtr();
   SDValue NewST1 =
       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
-                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+                   S->getAlignment(), S->getMemOperand()->getFlags());
   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                   DAG.getConstant(8, DL, MVT::i64));
   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
-                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
-                      S->getAlignment());
+                      S->getPointerInfo(), S->getAlignment(),
+                      S->getMemOperand()->getFlags());
 }
 
 /// Target-specific DAG combine function for post-increment LD1 (lane) and
@@ -8741,9 +8887,10 @@ static SDValue performPostLD1Combine(SDNode *N,
                                 LoadSDN->getMemOperand());
 
   // Update the uses.
-  SmallVector<SDValue, 2> NewResults;
-  NewResults.push_back(SDValue(LD, 0));             // The result of load
-  NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+  SDValue NewResults[] = {
+      SDValue(LD, 0),            // The result of load
+      SDValue(UpdN.getNode(), 2) // Chain
+  };
   DCI.CombineTo(LD, NewResults);
   DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
   DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
@@ -8774,8 +8921,7 @@ static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
-  SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
-  if (Split.getNode())
+  if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
     return Split;
 
   if (Subtarget->supportsAddressTopByteIgnored() &&
@@ -9215,10 +9361,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
   }
   case ISD::Constant:
   case ISD::TargetConstant: {
-    if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
-        1LL << (width - 1))
-      return true;
-    return false;
+    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+           1LL << (width - 1);
   }
   }
 
@@ -9286,14 +9430,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
 // isEquivalentMaskless() is the code for testing if the AND can be removed
 // factored out of the DAG recognition as the DAG can take several forms.
 
-static
-bool isEquivalentMaskless(unsigned CC, unsigned width,
-                          ISD::LoadExtType ExtType, signed AddConstant,
-                          signed CompConstant) {
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+                                 ISD::LoadExtType ExtType, int AddConstant,
+                                 int CompConstant) {
   // By being careful about our equations and only writing the in term
   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
   // make them generally applicable to all bit widths.
-  signed MaxUInt = (1 << width);
+  int MaxUInt = (1 << width);
 
   // For the purposes of these comparisons sign extending the type is
   // equivalent to zero extending the add and displacing it by half the integer
@@ -9441,8 +9584,7 @@ SDValue performCONDCombine(SDNode *N,
 static SDValue performBRCONDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
-  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
-  if (NV.getNode())
+  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
     N = NV.getNode();
   SDValue Chain = N->getOperand(0);
   SDValue Dest = N->getOperand(1);
@@ -9678,7 +9820,7 @@ static SDValue performSelectCombine(SDNode *N,
 
   // Now duplicate the comparison mask we want across all other lanes.
   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
-  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
   Mask = DAG.getNode(ISD::BITCAST, DL,
                      ResVT.changeVectorElementTypeToInteger(), Mask);
@@ -9716,6 +9858,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performFDivCombine(N, DAG, Subtarget);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
+  case ISD::SRL:
+    return performSRLCombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
@@ -9829,10 +9973,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
 // return instructions to help enable tail call optimizations for this
 // instruction.
 bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!CI->isTailCall())
-    return false;
-
-  return true;
+  return CI->isTailCall();
 }
 
 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
@@ -9935,6 +10076,31 @@ static void ReplaceReductionResults(SDNode *N,
   Results.push_back(SplitVal);
 }
 
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG) {
+  assert(N->getValueType(0) == MVT::i128 &&
+         "AtomicCmpSwap on types less than 128 should be legal");
+  SDValue Ops[] = {N->getOperand(1),
+                   N->getOperand(2)->getOperand(0),
+                   N->getOperand(2)->getOperand(1),
+                   N->getOperand(3)->getOperand(0),
+                   N->getOperand(3)->getOperand(1),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = DAG.getMachineNode(
+      AArch64::CMP_SWAP_128, SDLoc(N),
+      DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  Results.push_back(SDValue(CmpSwap, 0));
+  Results.push_back(SDValue(CmpSwap, 1));
+  Results.push_back(SDValue(CmpSwap, 3));
+}
+
 void AArch64TargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
@@ -9966,11 +10132,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
     // Let normal code take care of it by not adding anything to Results.
     return;
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceCMP_SWAP_128Results(N, Results, DAG);
+    return;
   }
 }
 
 bool AArch64TargetLowering::useLoadStackGuardNode() const {
-  return true;
+  if (!Subtarget->isTargetAndroid())
+    return true;
+  return TargetLowering::useLoadStackGuardNode();
 }
 
 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10017,14 +10188,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
     AtomicCmpXchgInst *AI) const {
-  return true;
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement cmpxchg without spilling. If the address being exchanged is also
+  // on the stack and close enough to the spill slot, this can lead to a
+  // situation where the monitor always gets cleared and the atomic operation
+  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -10066,7 +10242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -10104,6 +10280,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getIRStackGuard(IRB); + + // Android provides a fixed TLS slot for the stack cookie. See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x28; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -10114,7 +10306,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); @@ -10166,3 +10358,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on AArch64 is expensive. However, when aggressively + // optimizing for code size, we prefer to use a div instruction, as it is + // usually smaller than the alternative sequence. + // The exception to this is vector division. Since AArch64 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. 
+ bool OptSize = + Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index e99616c94068..c87cfed1f892 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -187,6 +187,10 @@ enum NodeType : unsigned { SMULL, UMULL, + // Reciprocal estimates. + FRECPE, + FRSQRTE, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -272,11 +276,11 @@ public: SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, + EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, @@ -358,6 +362,10 @@ public: TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the stack protector cookie, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getIRStackGuard(IRBuilder<> &IRB) const override; + /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; @@ -378,6 +386,8 @@ public: return AArch64::X1; } + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isCheapToSpeculateCttz() const override { return true; } @@ -385,6 +395,12 @@ public: bool isCheapToSpeculateCtlz() const override { return true; } + + bool hasBitPreservingFPLogic(EVT VT) const override { + // FIXME: Is this always true? It should be true for vectors at least. + return VT == MVT::f32 || VT == MVT::f64; + } + bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); @@ -394,6 +410,10 @@ public: MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; + bool supportSwiftError() const override { + return true; + } + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -401,30 +421,30 @@ private: /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; + const SmallVectorImpl &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const; @@ -439,7 +459,7 @@ private: bool IsTailCallConvention(CallingConv::ID CallCC) const; - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -449,21 +469,21 @@ private: SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, SDLoc DL, + const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, - SDValue TVal, SDValue FVal, SDLoc dl, + SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; @@ -500,6 +520,11 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const override; + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; @@ -515,6 +540,9 @@ private: std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo 
*TRI, StringRef Constraint, MVT VT) const override; + + const char *LowerXConstraint(EVT ConstraintVT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 4923a1161dfc..59de62ad2877 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,7 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; class acquiring_load : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - return isAtLeastAcquire(Ordering); + return isAcquireOrStronger(Ordering); }]>; // An atomic load operation that does not need either acquire or release @@ -37,7 +37,7 @@ class acquiring_load class relaxed_load : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - return !isAtLeastAcquire(Ordering); + return !isAcquireOrStronger(Ordering); }]>; // 8-bit loads @@ -112,15 +112,16 @@ def : Pat<(relaxed_load class releasing_store : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return isAtLeastRelease(Ordering); + assert(Ordering != AtomicOrdering::AcquireRelease && + "unexpected store ordering"); + return isReleaseOrStronger(Ordering); }]>; // An atomic store operation that doesn't actually need to be atomic on AArch64. class relaxed_store : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - return !isAtLeastRelease(Ordering); + return !isReleaseOrStronger(Ordering); }]>; // 8-bit stores @@ -361,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), // And clear exclusive. def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; + +//===---------------------------------- +// Atomic cmpxchg for -O0 +//===---------------------------------- + +// The fast register allocator used during -O0 inserts spills to cover any VRegs +// live across basic block boundaries. When this happens between an LDXR and an +// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to +// fail. + +// Unfortunately, this means we have to have an alternative (expanded +// post-regalloc) path for -O0 compilations. Fortunately this path can be +// significantly more naive than the standard expansion: we conservatively +// assume seq_cst, strong cmpxchg and omit clrex on failure. 
+ +let Constraints = "@earlyclobber $Rd,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in { +def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>, + Sched<[WriteAtomic]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in +def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 6ac2175e5035..34d35e961210 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -496,7 +496,7 @@ def imm0_65535 : Operand, ImmLeaf { let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImmHex"; } // imm0_255 predicate - True if the immediate is in the range [0,255]. @@ -505,7 +505,7 @@ def imm0_255 : Operand, ImmLeaf { let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // imm0_127 predicate - True if the immediate is in the range [0,127] @@ -514,7 +514,7 @@ def imm0_127 : Operand, ImmLeaf { let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // NOTE: These imm0_N operands have to be of type i64 because i64 is the size @@ -923,10 +923,7 @@ def psbhint_op : Operand { // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. 
if (!MCOp.isImm()) return false; - bool ValidNamed; - (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), - STI.getFeatureBits(), ValidNamed); - return ValidNamed; + return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr; }]; } @@ -1549,7 +1546,7 @@ class ADRI pattern> def movimm32_imm : Operand { let ParserMatchClass = Imm0_65535Operand; let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } def movimm32_shift : Operand { let PrintMethod = "printShifter"; @@ -9377,7 +9374,8 @@ class BaseCASEncoding : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "cas" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 1; } @@ -9391,7 +9389,8 @@ multiclass CompareAndSwap Acq, bits<1> Rel, string order> { class BaseCASP : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "casp" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 0; } @@ -9405,7 +9404,8 @@ multiclass CompareAndSwapPair Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a] in class BaseSWP : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; @@ -9436,7 +9436,8 @@ multiclass Swap Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in class BaseLDOPregister : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index f398117de953..0aa4708f35ac 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,27 +22,31 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" +static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. -unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI->getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a nomal 4-byte insn. @@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, } // Branch analysis. 
-bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { +bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; @@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional @@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. TBB = LastInst->getOperand(0).getMBB(); return false; } else { - SecondLastInst = I; + SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. @@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } void AArch64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc @@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef Cond, DebugLoc DL) const { +unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect( } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 
-static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { - uint64_t Imm = MI->getOperand(1).getImm(); +static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { + uint64_t Imm = MI.getOperand(1).getImm(); uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); @@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. -bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { - if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) - return MI->isAsCheapAsAMove(); +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + if (!Subtarget.hasCustomCheapAsMoveHandling()) + return MI.isAsCheapAsAMove(); + + unsigned Imm; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return false; @@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (MI->getOperand(3).getImm() == 0); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || + MI.getOperand(3).getImm() == 0); + + // add/sub on register with shift + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getArithShiftValue(Imm) < 4); // logical ops on immediate case AArch64::ANDWri: @@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + + // logical ops on register with shift + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getShiftValue(Imm) < 4 && + AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); + + // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing + // feature. 
+ case AArch64::FMOVS0: + case AArch64::FMOVD0: + return Subtarget.hasZeroCycleZeroing(); + case TargetOpcode::COPY: + return (Subtarget.hasZeroCycleZeroing() && + (MI.getOperand(1).getReg() == AArch64::WZR || + MI.getOperand(1).getReg() == AArch64::XZR)); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } -bool -AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { +bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( + MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned BaseRegA = 0, BaseRegB = 0; - int OffsetA = 0, OffsetB = 0; - int WidthA = 0, WidthB = 0; + int64_t OffsetA = 0, OffsetB = 0; + unsigned WidthA = 0, WidthB = 0; - assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); - assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || - MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base register, offset from the base register and width. Width @@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: @@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = (MI->getOperand(2).getImm() != 0); + CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, @@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, // the high 32 bits of uint64_t will be lost. 
// In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = (AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI.getOperand(2).getImm(), + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); +static bool UpdateOperandRegClass(MachineInstr &Instr) { + MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); @@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); + MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); + Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) @@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { +static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; - if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: - return MI->getOpcode(); + return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: @@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { } } -/// True when condition code could be modified on the instruction -/// trace starting at from and ending at to. -static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, - const bool CheckOnlyCCWrites, - const TargetRegisterInfo *TRI) { - // We iterate backward starting \p To until we hit \p From - MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); +enum AccessKind { + AK_Write = 0x01, + AK_Read = 0x10, + AK_All = 0x11 +}; +/// True when condition flags are accessed (either by writing or reading) +/// on the instruction trace starting at From and ending at To. +/// +/// Note: If From and To are from different blocks it's assumed CC are accessed +/// on the path. +static bool areCFlagsAccessedBetweenInstrs( + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. 
- if (I == B) + if (To == To->getParent()->begin()) return true; - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, assume the condition code gets modified on some path. + // Check whether the instructions are in the same basic block. + // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; - // Check that NZCV isn't set on the trace. - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; + // From must be above To. + assert(std::find_if(MachineBasicBlock::reverse_iterator(To), + To->getParent()->rend(), [From](MachineInstr &MI) { + return MachineBasicBlock::iterator(MI) == From; + }) != To->getParent()->rend()); - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) - // This instruction modifies or uses NZCV after the one we want to - // change. - return true; - if (I == B) - // We currently don't allow the instruction trace to cross basic - // block boundaries + // We iterate backward starting \p To until we hit \p From. + for (--To; To != From; --To) { + const MachineInstr &Instr = *To; + + if (((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. + +/// Try to optimize a compare instruction. A compare instruction is an +/// instruction which produces AArch64::NZCV. It is truly a compare instruction +/// only when there are no uses of its destination register. +/// +/// The following steps are tried in order: +/// 1. Convert CmpInstr into an unconditional version. +/// 2. Remove CmpInstr if there is an instruction above it that produces the +/// needed condition code, or one that can be converted into such an instruction. +/// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + assert(CmpInstr.getParent()); + assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { - if (CmpInstr->definesRegister(AArch64::WZR) || - CmpInstr->definesRegister(AArch64::XZR)) { - CmpInstr->eraseFromParent(); + int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); + if (DeadNZCVIdx != -1) { + if (CmpInstr.definesRegister(AArch64::WZR) || + CmpInstr.definesRegister(AArch64::XZR)) { + CmpInstr.eraseFromParent(); return true; } - unsigned Opc = CmpInstr->getOpcode(); + unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); + CmpInstr.setDesc(MCID); + CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; // CmpInstr is a Compare instruction if destination register is not used.
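// For example (an illustrative sketch, hypothetical registers): "subs w0,
// w1, #4" still defines w0, so it may only be rewritten or removed when w0
// has no further non-debug uses, whereas the WZR-destination form
// "cmp w1, #4" is always a pure compare.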
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - bool CheckOnlyCCWrites = false; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) - return false; + return substituteCmpToZero(CmpInstr, SrcReg, MRI); +} - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { +/// Get the opcode of the S (flag-setting) version of Instr. +/// If Instr is already an S version, its opcode is returned. +/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S +/// version or we are not interested in it. +static unsigned sForm(MachineInstr &Instr) { + switch (Instr.getOpcode()) { default: - return false; + return AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: @@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr( case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - break; - case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; - case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; - case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; - case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; - case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; - case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; - case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; - case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; - case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; - case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; - case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; - case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; - case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; - case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != AArch64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } + return Instr.getOpcode(); + + case AArch64::ADDWrr: return AArch64::ADDSWrr; + case AArch64::ADDWri: return AArch64::ADDSWri; + case AArch64::ADDXrr: return AArch64::ADDSXrr; + case AArch64::ADDXri: return AArch64::ADDSXri; + case AArch64::ADCWr: return AArch64::ADCSWr; + case AArch64::ADCXr: return AArch64::ADCSXr; + case AArch64::SUBWrr: return AArch64::SUBSWrr; + case AArch64::SUBWri: return AArch64::SUBSWri; + case AArch64::SUBXrr: return AArch64::SUBSXrr; + case AArch64::SUBXri: return AArch64::SUBSXri; + case AArch64::SBCWr: return AArch64::SBCSWr; + case AArch64::SBCXr: return AArch64::SBCSXr; + case AArch64::ANDWri: return AArch64::ANDSWri; + case AArch64::ANDXri: return AArch64::ANDSXri; + } +} - // Decode the condition code.
- unsigned Opc = Instr.getOpcode(); - AArch64CC::CondCode CC; - switch (Opc) { - default: - return false; - case AArch64::Bcc: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } +/// Check if AArch64::NZCV should be alive in successors of MBB. +static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { + for (auto *BB : MBB->successors()) + if (BB->isLiveIn(AArch64::NZCV)) + return true; + return false; +} - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case AArch64CC::VS: - case AArch64CC::VC: - case AArch64CC::GE: - case AArch64CC::LT: - case AArch64CC::GT: - case AArch64CC::LE: - return false; - } +struct UsedNZCV { + bool N; + bool Z; + bool C; + bool V; + UsedNZCV(): N(false), Z(false), C(false), V(false) {} + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// Find a condition code used by the instruction. +/// Returns AArch64CC::Invalid if either the instruction does not use condition +/// codes or we don't optimize CmpInstr in the presence of such instructions. +static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { + switch (Instr.getOpcode()) { + default: + return AArch64CC::Invalid; + + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); } + + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } + } +} + +static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { + assert(CC != AArch64CC::Invalid); + UsedNZCV UsedFlags; + switch (CC) { + default: + break; + + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; + + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; + + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; + + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; + + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } + return UsedFlags; +} + +static bool isADDSRegImm(unsigned Opcode) { + return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; +} + +static bool isSUBSRegImm(unsigned Opcode) { + return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; +} + +/// Check if CmpInstr can be substituted by MI.
+/// +/// CmpInstr can be substituted: +/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' +/// - and, MI and CmpInstr are from the same MachineBB +/// - and, condition flags are not alive in successors of the CmpInstr parent +/// - and, if MI opcode is the S form there must be no defs of flags between +/// MI and CmpInstr +/// or if MI opcode is not the S form there must be neither defs of flags +/// nor uses of flags between MI and CmpInstr. +/// - and C/V flags are not used after CmpInstr +static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, + const TargetRegisterInfo *TRI) { + assert(MI); + assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); + assert(CmpInstr); + + const unsigned CmpOpcode = CmpInstr->getOpcode(); + if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) + return false; - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(AArch64::NZCV)) + if (MI->getParent() != CmpInstr->getParent()) + return false; + + if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) + return false; + + AccessKind AccessToCheck = AK_Write; + if (sForm(*MI) != MI->getOpcode()) + AccessToCheck = AK_All; + if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) + return false; + + UsedNZCV NZCVUsedAfterCmp; + for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.readsRegister(AArch64::NZCV, TRI)) { + AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); + if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; + NZCVUsedAfterCmp |= getUsedNZCV(CC); + } + + if (Instr.modifiesRegister(AArch64::NZCV, TRI)) + break; } + + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; +} + +/// Substitute an instruction comparing to zero with another instruction +/// which produces needed condition flags. +/// +/// Return true on success. +bool AArch64InstrInfo::substituteCmpToZero( + MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const { + assert(MRI); + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + unsigned NewOpc = sForm(*MI); + if (NewOpc == AArch64::INSTRUCTION_LIST_END) + return false; + + if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) + return false; // Update the instruction to set NZCV. 
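// A sketch of the substitution being performed (illustrative assembly,
// hypothetical registers):
//
//   w0 = sub w1, w2              w0 = subs w1, w2  ; S form, sets NZCV
//   cmp w0, #0             ==>   ; compare removed
//   b.eq .Lbb                    b.eq .Lbb
//
// This is sound only because the checks above proved that nothing after the
// compare reads the C or V flags: C and V produced by "subs w1, w2" need not
// match those a comparison of w0 with zero would have produced.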
MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); + CmpInstr.eraseFromParent(); + bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) return false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = - cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; @@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); @@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); @@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI->memoperands_begin()); + .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); @@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } /// Return true if this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrs: @@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { case AArch64::SUBSXrs: case AArch64::SUBWrs: case AArch64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { } /// Return true if this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) {
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrx: @@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1107,51 +1273,51 @@ // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. -bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 3 && + MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: - return MI->getOperand(1).getReg() == AArch64::XZR; + return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. -bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will be lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == AArch64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + if (MI.getOperand(1).getReg() == AArch64::XZR) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + if (MI.getOperand(2).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; @@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { // Return true if this instruction simply renames a floating-point register // without modifying bits.
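// Illustrative assembly for the FPR case handled next: "mov v0.16b, v1.16b"
// is an alias of "orr v0.16b, v1.16b, v1.16b", which is why an ORRv16i8 whose
// two source operands are identical counts as a plain FPR rename below.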
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will be lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } @@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { return false; } -unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: @@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::STRWui: @@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, /// Return true if this load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing.
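// For example (illustrative assembly): "ldr x0, [x1, x2, lsl #3]" scales its
// dynamic index and is reported as scaled addressing, while
// "ldr x0, [x1, x2]" uses a plain UXTX index with no shift and is not.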
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::LDRBBroW: @@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { case AArch64::STRWroX: case AArch64::STRXroX: - unsigned Val = MI->getOperand(3).getImm(); + unsigned Val = MI.getOperand(3).getImm(); AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); } @@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { } /// Check all MachineMemOperands for a hint to suppress pairing. -bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { + return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOSuppressPair; + }); } /// Set a flag on the first MachineMemOperand to suppress pairing. -void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) +void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { + if (MI.memoperands_empty()) return; + (*MI.memoperands_begin())->setFlags(MOSuppressPair); +} - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } } -bool -AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { +bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const { + return isUnscaledLdSt(MI.getOpcode()); +} + +// Is this a candidate for ld/st merging or pairing? For example, we don't +// touch volatiles or load/stores that have a hint to avoid pair formation. +bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { + // If this is a volatile load/store, don't mess with it. + if (MI.hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + if (!MI.getOperand(2).isImm()) + return false; + + // Can't merge/pair if the instruction modifies the base register. + // e.g., ldr x0, [x0] + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (isLdStPairSuppressed(MI)) + return false; + + // On some CPUs quad load/store pairs are slower than two single load/stores. + if (Subtarget.avoidQuadLdStPairs()) { + switch (MI.getOpcode()) { + default: + break; + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQui: + case AArch64::STRQui: + return false; + } + } + + return true; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfs( + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt.getOpcode()) { default: return false; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + unsigned Width; + return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); }; } bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + if (LdSt.getNumExplicitOperands() == 3) { + // Non-paired instruction (e.g., ldr x1, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + return false; + } else if (LdSt.getNumExplicitOperands() == 4) { + // Paired instruction (e.g., ldp x1, x2, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + !LdSt.getOperand(3).isImm()) + return false; + } else return false; // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
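// Worked example (values are illustrative): LDRXui has Scale == 8, so
// "ldr x1, [x0, #16]" carries an immediate operand of 2 and yields
// Offset == 2 * 8 == 16; the unscaled "ldur x1, [x0, #-3]" has Scale == 1
// and keeps the raw byte offset of -3.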
- int Scale = 0; - switch (LdSt->getOpcode()) { + unsigned Scale = 0; + switch (LdSt.getOpcode()) { default: return false; case AArch64::LDURQi: @@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::STPQi: + case AArch64::STNPQi: + Scale = 16; + Width = 32; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRWui: case AArch64::LDRSui: + case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; @@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRBBui: Scale = Width = 1; break; - }; + } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } return true; } +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = 1; + switch (Opc) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + OffsetStride = 16; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + OffsetStride = 8; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + OffsetStride = 4; + break; + } + // If the byte-offset isn't a multiple of the stride, we can't scale this + // offset. + if (Offset % OffsetStride != 0) + return false; + + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + Offset /= OffsetStride; + return true; +} + +static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { + if (FirstOpc == SecondOpc) + return true; + // We can also pair sign-ext and zero-ext instructions. + switch (FirstOpc) { + default: + return false; + case AArch64::LDRWui: + case AArch64::LDURWi: + return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; + } + // These instructions can't be paired based on their opcodes. + return false; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. 
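// A sketch of the intended outcome (hypothetical registers): once unscaled
// offsets are normalized to element units below, "ldr x1, [x0, #8]" and
// "ldr x2, [x0, #16]" become elements 1 and 2 of the same base, pass the
// adjacency and range checks, and may later be merged into
// "ldp x1, x2, [x0, #8]".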
if (NumLoads > 1) return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + + // Can we pair these instructions based on their opcodes? + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + if (!canPairLdStOpc(FirstOpc, SecondOpc)) + return false; + + // Can't merge volatiles or load/stores that have a hint to avoid pair + // formation, for example. + if (!isCandidateToMergeOrPair(FirstLdSt) || + !isCandidateToMergeOrPair(SecondLdSt)) + return false; + + // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. + int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) + return false; + + int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; - // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) + + // Pairwise instructions have a 7-bit signed offset field. + if (Offset1 > 63 || Offset1 < -64) return false; + // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - if (Subtarget.isCyclone()) { - // Cyclone can fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second->getOpcode(); +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { + if (Subtarget.hasMacroOpFusion()) { + // Fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::SUBSWri: @@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, return true; } } - // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. + // Fuse ALU operations followed by CBZ/CBNZ. 
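// E.g. (illustrative): keeping "add w1, w2, w3" immediately before
// "cbz w1, .Ltarget" lets a fusing core issue the ALU operation and the
// branch as a single macro-op.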
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: @@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const { + const MDNode *Expr, const DebugLoc &DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) @@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, } void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && @@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple( } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); - BuildMI(MBB, I, DL, get(AArch64::MRS)) - .addReg(DestReg) + BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; @@ -1879,39 +2229,45 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov1d, Offset = false; + Opc = AArch64::ST1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev1d, Offset = false; + Opc = AArch64::ST1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv1d, Offset = false; + Opc = AArch64::ST1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov2d, Offset = false; + Opc = AArch64::ST1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev2d, Offset = false; + Opc = AArch64::ST1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv2d, Offset = false; + Opc = AArch64::ST1Fourv2d; + Offset = false; } break; } @@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov1d, Offset = false; + Opc = AArch64::LD1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev1d, Offset = false; + Opc = AArch64::LD1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv1d, Offset = false; + Opc = AArch64::LD1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov2d, Offset = false; + Opc = AArch64::LD1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev2d, Offset = false; + Opc = AArch64::LD1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv2d, Offset = false; + Opc = AArch64::LD1Fourv2d; + Offset = false; } break; } @@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot( } void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; @@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 // // We explicitly chose GPR64all for the virtual register so such a copy might // be eliminated by RegisterCoalescer. However, that may not be possible, and // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all // register class, TargetInstrInfo::foldMemoryOperand() is going to try. // // To prevent that, we are going to constrain the %vreg0 register class here. // // // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + if (MI.isCopy()) { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); @@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } -/// useMachineCombiner - return true when a target supports MachineCombiner + +// AArch64 supports MachineCombiner.
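// Returning true below merely opts this target in to the generic
// MachineCombiner pass; the real work is done by getMachineCombinerPatterns()
// and genAlternativeCodeSequence() later in this file, which, for example,
// can rewrite a mul feeding an add into a single madd.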
bool AArch64InstrInfo::useMachineCombiner() const { - // AArch64 supports the combiner + return true; } // @@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // +// FP Opcodes that can be combined with an FMUL +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply that can be combined? +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply that can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root, bool Found = false; if (!isCombineInstrCandidate(Opc)) - return 0; + return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out.
if (Cmp_NZCV == -1) - return 0; - unsigned NewOpc = convertFlagSettingOpcode(&Root); + return false; + unsigned NewOpc = convertFlagSettingOpcode(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) - return 0; + return false; Opc = NewOpc; } @@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into fmadd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return false; + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDSrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB,
Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern +bool +AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + switch (Pattern) { + default: + break; + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULADDD_OP1: + case MachineCombinerPattern::FMULADDD_OP2: + case MachineCombinerPattern::FMULSUBD_OP1: + case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + case MachineCombinerPattern::FMLAv2f32_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + case MachineCombinerPattern::FMLAv2f64_OP2: + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP1: + case MachineCombinerPattern::FMLAv4f32_OP2: + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4f32_OP2: + return true; + } // end switch (Pattern) + return false; +} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; + // Floating point patterns + if (getFMAPatterns(Root, Patterns)) + return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -/// genMadd - Generate madd instruction and combine mul and add. -/// Example: -/// MUL I=A,B,0 -/// ADD R,I,C -/// ==> MADD R,A,B,C -/// \param Root is the ADD instruction +enum class FMAInstKind { Default, Indexed, Accumulator }; +/// genFusedMultiply - Generate fused multiply instructions. +/// This function supports both integer and floating point instructions. +/// A typical example: +/// F|MUL I=A,B,0 +/// F|ADD R,I,C +/// ==> F|MADD R,A,B,C +/// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the madd instruction -static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - const TargetRegisterClass *RC) { +/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS) InsInstrs.push_back(MIB); return MUL; } @@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // FMUL I=A,B,0 + // FADD R,I,C + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case
MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = 
AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -2940,14 +3817,23 @@ void 
AArch64InstrInfo::genAlternativeCodeSequence( /// to /// b.<inverted condition code> /// +/// Replace compare and branch sequence by TBZ/TBNZ instruction when the +/// compare's constant operand is a power of 2. +/// +/// Examples: +/// and w8, w8, #0x400 +/// cbnz w8, L1 +/// to +/// tbnz w8, #10, L1 +/// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// -bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: @@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. - if (IsTestAndBranch && MI->getOperand(1).getImm()) + if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. - assert(MI->getParent() && "Incomplete machine instruciton\n"); - MachineBasicBlock *MBB = MI->getParent(); + assert(MI.getParent() && "Incomplete machine instruction\n"); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI->getOperand(0).getReg(); + unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); - // Look for CSINC - if (!(DefMI->getOpcode() == AArch64::CSINCWr && - DefMI->getOperand(1).getReg() == AArch64::WZR && - DefMI->getOperand(2).getReg() == AArch64::WZR) && - !(DefMI->getOpcode() == AArch64::CSINCXr && - DefMI->getOperand(1).getReg() == AArch64::XZR && - DefMI->getOperand(2).getReg() == AArch64::XZR)) - return false; + // Look through COPY instructions to find definition. + while (DefMI->isCopy()) { + unsigned CopyVReg = DefMI->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(CopyVReg)) + return false; + if (!MRI->hasOneDef(CopyVReg)) + return false; + DefMI = MRI->getVRegDef(CopyVReg); + } - if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + switch (DefMI->getOpcode()) { + default: return false; + // Fold AND into a TBZ/TBNZ if constant operand is a power of 2. + case AArch64::ANDWri: + case AArch64::ANDXri: { + if (IsTestAndBranch) + return false; + if (DefMI->getParent() != MBB) + return false; + if (!MRI->hasOneNonDBGUse(VReg)) + return false; - AArch64CC::CondCode CC = - (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); - bool CheckOnlyCCWrites = true; - // Convert only when the condition code is not modified between - // the CSINC and the branch. The CC may be used by other - // instructions in between. - if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) - return false; - MachineBasicBlock &RefToMBB = *MBB; - MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); - DebugLoc DL = MI->getDebugLoc(); - if (IsNegativeBranch) - CC = AArch64CC::getInvertedCondCode(CC); - BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); - MI->eraseFromParent(); - return true; + bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); + uint64_t Mask = AArch64_AM::decodeLogicalImmediate( + DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); + if (!isPowerOf2_64(Mask)) + return false; + + MachineOperand &MO = DefMI->getOperand(1); + unsigned NewReg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + return false; + + assert(!MRI->def_empty(NewReg) && "Register must be defined."); + + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Imm = Log2_64(Mask); + unsigned Opc = (Imm < 32) + ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) + : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); + MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) + .addReg(NewReg) + .addImm(Imm) + .addMBB(TBB); + // Register lives on to the CBZ now. + MO.setIsKill(false); + + // For immediates smaller than 32, we need to use the 32-bit + // variant (W) in all cases. Indeed the 64-bit variant cannot + // encode them. + // Therefore, if the input register is 64-bit, we need to take the + // 32-bit sub-part. + if (!Is32Bit && Imm < 32) + NewMI->getOperand(0).setSubReg(AArch64::sub_32); + MI.eraseFromParent(); + return true; + } + // Look for CSINC + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI.eraseFromParent(); + return true; + } + } } std::pair<unsigned, unsigned> @@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}, - {MO_CONSTPOOL, "aarch64-constant-pool"}}; + {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index b5bb446f8c16..24bc0e639747 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -28,12 +28,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64InstrInfo : public AArch64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -45,76 +39,88 @@ public: /// always be able to get register info as well (through this method). 
const AArch64RegisterInfo &getRegisterInfo() const { return RI; } - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr &MI) const; - bool isAsCheapAsAMove(const MachineInstr *MI) const override; + bool isAsCheapAsAMove(const MachineInstr &MI) const override; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, + unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, + unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; /// Returns true if there is a shiftable register and the shift value /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; + bool hasShiftedReg(const MachineInstr &MI) const; /// Returns true if there is an extendable register and the extending /// value is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; + bool hasExtendedReg(const MachineInstr &MI) const; /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; + bool isGPRZero(const MachineInstr &MI) const; /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; + bool isGPRCopy(const MachineInstr &MI) const; /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; + bool isFPRCopy(const MachineInstr &MI) const; /// Return true if this load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. - bool isScaledAddr(const MachineInstr *MI) const; + bool isScaledAddr(const MachineInstr &MI) const; /// Return true if pairing the given load or store is hinted to be /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; + bool isLdStPairSuppressed(const MachineInstr &MI) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(unsigned Opc) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(MachineInstr &MI) const; + + /// Return true if this is a load/store that can be potentially paired/merged. + bool isCandidateToMergeOrPair(MachineInstr &MI) const; /// Hint that pairing the given load or store is unprofitable. 
- void suppressLdStPair(MachineInstr *MI) const; + void suppressLdStPair(MachineInstr &MI) const; - bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, + bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const override; - bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, - int &Offset, int &Width, + bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; bool enableClusterLoads() const override { return true; } - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; + bool enableClusterStores() const override { return true; } + + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + unsigned NumLoads) const override; - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; + bool shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const override; MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const; + const MDNode *Expr, + const DebugLoc &DL) const; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -129,40 +135,47 @@ public: const TargetRegisterInfo *TRI) const override; using TargetInstrInfo::foldMemoryOperandImpl; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS = nullptr) const override; - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify = false) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - DebugLoc DL) const override; + const DebugLoc &DL) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, unsigned, unsigned, int &, int &, int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const override; + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, + unsigned FalseReg) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. 
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction supplying the argument to /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; - bool optimizeCondBranch(MachineInstr *MI) const override; + bool optimizeCondBranch(MachineInstr &MI) const override; + + /// Return true when a code sequence can improve throughput. It + /// should be called only for instructions in loops. + /// \param Pattern - combiner pattern + bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. @@ -179,10 +192,10 @@ public: SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; - /// useMachineCombiner - AArch64 supports MachineCombiner + /// AArch64 supports MachineCombiner. bool useMachineCombiner() const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -192,9 +205,11 @@ public: getSerializableBitmaskMachineOperandTargetFlags() const override; private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const; + bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg /// @@ -202,8 +217,8 @@ private: /// insertion (PEI) pass, where a virtual scratch register may be allocated /// if necessary, to be replaced by the scavenger at the end of PEI. 
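/// For example (an illustrative use, not part of this patch), copying SP plus
/// a small constant offset into the frame pointer:
///   emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, 16, TII);
/// would emit an instruction such as "add x29, sp, #16".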
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const TargetInstrInfo *TII, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d02bc9ff394d..af9ed812e6da 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -26,6 +26,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasRAS : Predicate<"Subtarget->hasRAS()">, + AssemblerPredicate<"FeatureRAS", "ras">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; @@ -34,7 +36,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">, def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; -def IsCyclone : Predicate<"Subtarget->isCyclone()">; +def UseAlternateSExtLoadCVTF32 + : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -283,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -295,9 +301,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; def ForCodeSize : Predicate<"ForCodeSize">; @@ -312,10 +315,13 @@ include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +// We set Sched to empty list because we expect these instructions to simply get +// removed in most cases. 
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>; + [(AArch64callseq_start timm:$amt)]>, Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -383,6 +389,7 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; // v8.2a Statistical Profiling extension def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; @@ -528,6 +535,12 @@ def i64imm_32bit : ImmLeaf<i64, [{ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); }]>; +def s64imm_32bit : ImmLeaf<i64, [{ int64_t Imm64 = static_cast<int64_t>(Imm); + return Imm64 >= std::numeric_limits<int32_t>::min() && + Imm64 <= std::numeric_limits<int32_t>::max(); +}]>; + def trunc_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); }]>; @@ -679,10 +692,11 @@ def : InstAlias<"negs $dst, $src$shift", // Unsigned/Signed divide defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; -} + +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; @@ -734,6 +748,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)), + GPR64:$Ra)), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; + +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm 
imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32), + (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; } // AddedComplexity = 5 def : MulAccumWAlias<"mul", MADDWrrr>; @@ -1089,6 +1137,14 @@ def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), (CSINVWr WZR, WZR, (i32 imm:$cc))>; def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), (CSINVXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; // The inverse of the condition code from the alias instruction is what is used // in the aliased instruction. The parser already inverts the condition code @@ -1158,7 +1214,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; // Create a separate pseudo-instruction for codegen to use so that we don't // flag lr as used in every function. It'll be restored before the RET by the // epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>, + Sched<[WriteBrReg]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; @@ -1168,7 +1225,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { // R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction // (which in the usual case is a BLR). 
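// For reference, the canonical AArch64 general-dynamic TLS descriptor call
// sequence that this relocation applies to looks like the following (the
// standard ABI sequence, shown here for illustration only):
//   adrp  x0, :tlsdesc:var
//   ldr   x1, [x0, :tlsdesc_lo12:var]
//   add   x0, x0, :tlsdesc_lo12:var
//   .tlsdesccall var
//   blr   x1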
let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { let AsmString = ".tlsdesccall $sym"; } @@ -1178,7 +1235,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), - [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>, + Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), (TLSDESC_CALLSEQ texternalsym:$sym)>; @@ -2444,13 +2502,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; + +multiclass FPToIntegerIntPats<Intrinsic round, string INST> { + def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # "UWHr") $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # "UXHr") $Rn)>; + def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # "UWSr") $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # "UXSr") $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # "UWDr") $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # "UXDr") $Rn)>; + + def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # "SWHri") $Rn, $scale)>; + def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # "SXHri") $Rn, $scale)>; + def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # "SWSri") $Rn, $scale)>; + def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # "SXSri") $Rn, $scale)>; + def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # "SWDri") $Rn, $scale)>; + def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # "SXDri") $Rn, $scale)>; } +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">; +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; + multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { def : Pat<(i32 (to_int (round f32:$Rn))), (!cast<Instruction>(INST # "UWSr") f32:$Rn)>; @@ -2485,13 +2562,11 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable -let isReMaterializable = 1, isCodeGenOnly = 1 in { +let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; } //===----------------------------------------------------------------------===// @@ -2617,6 +2692,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), (i32 imm:$cond), NZCV))]> { let Uses = [NZCV]; let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; } @@ -2742,12 +2818,19 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", 
fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_aarch64_neon_fcvtzu>; -} + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>; + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; + defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; @@ -3318,6 +3401,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3330,6 +3426,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. @@ -4319,18 +4428,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; @@ -4845,7 +4942,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> 0), dsub)), 0), - ssub)))>, Requires<[NotForCodeSize, IsCyclone]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -4898,7 +4996,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)), 0), - dsub)))>, Requires<[NotForCodeSize, IsCyclone]>; + dsub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -5982,7 +6081,7 @@ def : NTStore64Pat; def : Pat<(nontemporalstore GPR64:$Rt, (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), - (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32), GPR64sp:$Rn, simm7s4:$offset)>; } // AddedComplexity=10 } // Predicates = [IsLE] @@ -5990,8 +6089,10 @@ def : Pat<(nontemporalstore GPR64:$Rt, // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 43664df3b861..dca13fc49414 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -33,9 +33,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" -/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); @@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", +// The LdStLimit limits how far we search for load/store pairs. +static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); +// The UpdateLimit limits how far we search for update instructions when we form +// pre-/post-index instructions. 
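+// For example (illustrative), the limit can be overridden on the llc command
+// line with -aarch64-update-scan-limit=<N>.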
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), + cl::Hidden); + +static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden, + cl::init(false), + cl::desc("Enable narrow load merge")); + namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } @@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; + // Track which registers have been modified and used. + BitVector ModifiedRegs, UsedRegs; + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, - unsigned Limit); + unsigned Limit, + bool FindNarrowMerge); // Scan the instructions looking for a store that writes to the address from // which the current load instruction reads. Return true if one is found. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a wider instruction. + MachineBasicBlock::iterator + mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, + const LdStPairFlags &Flags); + // Merge the two instructions indicated into a single pair-wise instruction. - // If MergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int UnscaledOffset); + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, + int UnscaledOffset, unsigned Limit); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find an instruction that updates the base register of the ld/st // instruction. - bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); // Merge a pre- or post-index base register update into a ld/st instruction. @@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge foldable ldr/str instructions. bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and pair ldr/str instructions. + bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); - // Check if converting two narrow loads into a single wider load with - // bitfield extracts could be enabled. 
- bool enableNarrowLdMerge(MachineFunction &Fn); - bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } @@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0; INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) -static bool isUnscaledLdSt(unsigned Opc) { - switch (Opc) { - default: - return false; - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURBBi: - case AArch64::STURHHi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - case AArch64::LDURSWi: - case AArch64::LDURHHi: - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - return true; - } -} - -static bool isUnscaledLdSt(MachineInstr *MI) { - return isUnscaledLdSt(MI->getOpcode()); -} - -static unsigned getBitExtrOpcode(MachineInstr *MI) { - switch (MI->getOpcode()) { +static unsigned getBitExtrOpcode(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode."); case AArch64::LDRBBui: @@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) { } } -static bool isNarrowStore(MachineInstr *MI) { - return isNarrowStore(MI->getOpcode()); -} - static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: @@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) { } } -static bool isNarrowLoad(MachineInstr *MI) { - return isNarrowLoad(MI->getOpcode()); +static bool isNarrowLoad(MachineInstr &MI) { + return isNarrowLoad(MI.getOpcode()); +} + +static bool isNarrowLoadOrStore(unsigned Opc) { + return isNarrowLoad(Opc) || isNarrowStore(Opc); } // Scaling factor for unscaled load or store. 
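// A few illustrative cases (the switch below is authoritative): byte accesses
// such as LDRBBui have scale 1, 32-bit accesses such as LDRWui have scale 4,
// and 128-bit accesses such as LDRQui have scale 16.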
-static int getMemScale(MachineInstr *MI) { - switch (MI->getOpcode()) { +static int getMemScale(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); case AArch64::LDRBBui: @@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, } } +static unsigned getMatchingWideOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no wide equivalent!"); + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; + case AArch64::STURWi: + return AArch64::STURXi; + case AArch64::STRWui: + return AArch64::STRXui; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; - case AArch64::STRBBui: - return AArch64::STRHHui; - case AArch64::STRHHui: - return AArch64::STRWui; - case AArch64::STURBBi: - return AArch64::STURHHi; - case AArch64::STURHHi: - return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; - case AArch64::LDRHHui: - case AArch64::LDRSHWui: - return AArch64::LDRWui; - case AArch64::LDURHHi: - case AArch64::LDURSHWi: - return AArch64::LDURWi; - case AArch64::LDRBBui: - case AArch64::LDRSBWui: - return AArch64::LDRHHui; - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - return AArch64::LDURHHi; } } -static unsigned isMatchingStore(MachineInstr *LoadInst, - MachineInstr *StoreInst) { - unsigned LdOpc = LoadInst->getOpcode(); - unsigned StOpc = StoreInst->getOpcode(); +static unsigned isMatchingStore(MachineInstr &LoadInst, + MachineInstr &StoreInst) { + unsigned LdOpc = LoadInst.getOpcode(); + unsigned StOpc = StoreInst.getOpcode(); switch (LdOpc) { default: llvm_unreachable("Unsupported load instruction!"); @@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isPairedLdSt(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDPSi: @@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) { } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI, +static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { +static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 2 : 1; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { +static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 
3 : 2; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, - MachineInstr *StoreInst) { +static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, + MachineInstr &StoreInst, + const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = getMemScale(LoadInst); int StoreSize = getMemScale(StoreInst); - int UnscaledStOffset = isUnscaledLdSt(StoreInst) + int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) ? getLdStOffsetOp(LoadInst).getImm() : getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } +static bool isPromotableZeroStoreOpcode(unsigned Opc) { + return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; +} + +static bool isPromotableZeroStoreOpcode(MachineInstr &MI) { + return isPromotableZeroStoreOpcode(MI.getOpcode()); +} + +static bool isPromotableZeroStoreInst(MachineInstr &MI) { + return (isPromotableZeroStoreOpcode(MI)) && + getLdStRegOp(MI).getReg() == AArch64::WZR; +} + MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, +AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; @@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. - if (NextI == Paired) + if (NextI == MergeMI) ++NextI; - int SExtIdx = Flags.getSExtIdx(); - unsigned Opc = - SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdSt(Opc); - int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + unsigned Opc = I->getOpcode(); + bool IsScaled = !TII->isUnscaledLdSt(Opc); + int OffsetStride = IsScaled ? 1 : getMemScale(*I); bool MergeForward = Flags.getMergeForward(); - unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); + MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (getLdStOffsetOp(I).getImm() == - getLdStOffsetOp(Paired).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - // Here we swapped the assumption made for SExtIdx. - // I.e., we turn ldp I, Paired into ldp Paired, I. - // Update the index accordingly. 
- if (SExtIdx != -1) - SExtIdx = (SExtIdx + 1) % 2; + if (getLdStOffsetOp(*I).getImm() == + getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) { + RtMI = &*MergeMI; + Rt2MI = &*I; } else { - RtMI = I; - Rt2MI = Paired; + RtMI = &*I; + Rt2MI = &*MergeMI; } - int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Change the scaled offset from small to large type. + if (IsScaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); if (isNarrowLoad(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; - } - MachineInstr *RtNewDest = MergeForward ? I : Paired; + MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI); // When merging small (< 32 bit) loads for big-endian targets, the order of // the component parts gets swapped. if (!Subtarget->isLittleEndian()) std::swap(RtMI, Rt2MI); // Construct the new load instruction. MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; - NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtNewDest)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + NewMemMI = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addOperand(getLdStRegOp(*RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)NewMemMI; DEBUG( dbgs() << "Creating the new load and extract. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); + DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG((NewMemMI)->print(dbgs())); - int Width = getMemScale(I) == 1 ? 8 : 16; + int Width = getMemScale(*I) == 1 ? 8 : 16; int LSBLow = 0; int LSBHigh = Width; int ImmsLow = LSBLow + Width - 1; int ImmsHigh = LSBHigh + Width - 1; - MachineInstr *ExtDestMI = MergeForward ? Paired : I; + MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I); if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { // Create the bitfield extract for high bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. 
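// For example (an illustrative sketch of the overall merge; the registers are
// hypothetical): two unsigned halfword loads
//   ldrh w0, [x2]
//   ldrh w1, [x2, #2]
// become one word load plus two bitfield extracts:
//   ldr  w8, [x2]
//   and  w0, w8, #0xffff     // low bits via AND
//   ubfx w1, w8, #16, #16    // high bits via UBFM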
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } } else { // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } // Create the bitfield extract for high bits. - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); } + (void)BitExtMI1; + (void)BitExtMI2; + DEBUG(dbgs() << " "); DEBUG((BitExtMI1)->print(dbgs())); DEBUG(dbgs() << " "); @@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Erase the old instructions. I->eraseFromParent(); - Paired->eraseFromParent(); + MergeMI->eraseFromParent(); return NextI; } + assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && + "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; - if (isNarrowStore(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)MIB; + + DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MergeMI->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. 
+ I->eraseFromParent(); + MergeMI->eraseFromParent(); + return NextI; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way, the merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + int SExtIdx = Flags.getSExtIdx(); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = TII->isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; + + bool MergeForward = Flags.getMergeForward(); + // Insert our new paired instruction after whichever of the paired + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // MergeForward also determines which instruction we copy the base register + // operand from, so that we get flags compatible with the input code. + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + + int Offset = getLdStOffsetOp(*I).getImm(); + int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); + if (IsUnscaled != PairedIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. If + // I is scaled then scale the offset of Paired accordingly. Otherwise, do + // the opposite (i.e., make Paired's offset unscaled). + int MemSize = getMemScale(*Paired); + if (PairedIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together. + assert(!(PairedOffset % getMemScale(*Paired)) && + "Offset should be a multiple of the stride!"); + PairedOffset /= MemSize; + } else { + PairedOffset *= MemSize; } - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(I)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + } + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (Offset == PairedOffset + OffsetStride) { + RtMI = &*Paired; + Rt2MI = &*I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { - // Handle Unscaled - if (IsUnscaled) - OffsetImm /= OffsetStride; - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtMI)) - .addOperand(getLdStRegOp(Rt2MI)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + RtMI = &*I; + Rt2MI = &*Paired; + } + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Scale the immediate offset, if necessary. + if (TII->isUnscaledLdSt(RtMI->getOpcode())) { + assert(!(OffsetImm % getMemScale(*RtMI)) && + "Unscaled offset cannot be scaled."); + OffsetImm /= getMemScale(*RtMI); } - (void)MIB; + // Construct the new instruction.
+ MachineInstrBuilder MIB; + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) + .addOperand(getLdStRegOp(*RtMI)) + .addOperand(getLdStRegOp(*Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? + (void)MIB; DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: @@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Insert this definition right after the generated LDP, i.e., before // InsertionPoint. MachineInstrBuilder MIBKill = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstRegW) + BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) .addReg(DstRegW) .addReg(DstRegX, RegState::Define); MIBKill->getOperand(2).setImplicit(); // Create the sign extension. MachineInstrBuilder MIBSXTW = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::SBFMXri), DstRegX) + BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) .addReg(DstRegX) .addImm(0) .addImm(31); (void)MIBSXTW; DEBUG(dbgs() << " Extend operand:\n "); DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); - DEBUG(dbgs() << "\n"); } else { DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); } + DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); @@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator NextI = LoadI; ++NextI; - int LoadSize = getMemScale(LoadI); - int StoreSize = getMemScale(StoreI); - unsigned LdRt = getLdStRegOp(LoadI).getReg(); - unsigned StRt = getLdStRegOp(StoreI).getReg(); + int LoadSize = getMemScale(*LoadI); + int StoreSize = getMemScale(*StoreI); + unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // performance and correctness are verified only in little-endian. if (!Subtarget->isLittleEndian()) return NextI; - bool IsUnscaled = isUnscaledLdSt(LoadI); - assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); + assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && + "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(LoadI).getImm() - : getLdStOffsetOp(LoadI).getImm() * LoadSize; + ? getLdStOffsetOp(*LoadI).getImm() + : getLdStOffsetOp(*LoadI).getImm() * LoadSize; int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(StoreI).getImm() - : getLdStOffsetOp(StoreI).getImm() * StoreSize; + ? 
getLdStOffsetOp(*StoreI).getImm() + : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); int Imms = Immr + Width - 1; @@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, .addImm(Imms); } } + (void)BitExtMI; DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); @@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (const MachineOperand &MO : MI->operands()) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (!Reg) + continue; if (MO.isDef()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ModifiedRegs.set(*AI); @@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - if (IsUnscaled) + if (IsUnscaled) { + // If the byte-offset isn't a multiple of the stride, there's no point + // trying to match it. + if (Offset % OffsetStride) + return false; Offset /= OffsetStride; - + } return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, // avoiding having to do a C-style cast from uint64_t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// using alignTo from include/llvm/Support/MathExtras.h. // FIXME: Move this function to include/MathExtras.h? static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, +static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, const AArch64InstrInfo *TII) { // One of the instructions must modify memory. - if (!MIa->mayStore() && !MIb->mayStore()) + if (!MIa.mayStore() && !MIb.mayStore()) return false; // Both instructions must be memory operations.
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) + if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); } -static bool mayAlias(MachineInstr *MIa, +static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, const AArch64InstrInfo *TII) { - for (auto &MIb : MemInsns) - if (mayAlias(MIa, MIb, TII)) + for (MachineInstr *MIb : MemInsns) + if (mayAlias(MIa, *MIb, TII)) return true; return false; @@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa, bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI) { - MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + MachineInstr &LoadMI = *I; + unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + + // If the load is the first instruction in the block, there's obviously + // not any matching store. + if (MBBI == B) + return false; // Track which registers have been modified and used between the first insn // and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); - for (unsigned Count = 0; MBBI != E && Count < Limit;) { + unsigned Count = 0; + do { --MBBI; - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - // Now that we know this is a real instruction, count it. - ++Count; + MachineInstr &MI = *MBBI; + + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If the load instruction reads directly from the address to which the // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. - if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + if (MI.mayStore() && isMatchingStore(LoadMI, MI) && BaseReg == getLdStBaseOp(MI).getReg() && - isLdOffsetInRangeOfSt(FirstMI, MI) && + isLdOffsetInRangeOfSt(LoadMI, MI, TII) && !ModifiedRegs[getLdStRegOp(MI).getReg()]) { StoreI = MBBI; return true; } - if (MI->isCall()) + if (MI.isCall()) return false; // Update modified / uses register lists. @@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) return false; - } + } while (MBBI != B && Count < Limit); return false; } -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. +// Returns true if FirstMI and MI are candidates for merging or pairing. +// Otherwise, returns false. +static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, + LdStPairFlags &Flags, + const AArch64InstrInfo *TII) { + // If this is volatile or if pairing is suppressed, not a candidate. + if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return false; + + // We should have already checked FirstMI for pair suppression and volatility.
+ assert(!FirstMI.hasOrderedMemoryRef() && + !TII->isLdStPairSuppressed(FirstMI) && + "FirstMI shouldn't get here if either of these checks are true."); + + unsigned OpcA = FirstMI.getOpcode(); + unsigned OpcB = MI.getOpcode(); + + // Opcodes match: nothing more to check. + if (OpcA == OpcB) + return true; + + // Try to match a sign-extended load/store with a zero-extended load/store. + bool IsValidLdStrOpc, PairIsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); + // OpcA will be the first instruction in the pair. + if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { + Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + return true; + } + + // If the second instruction isn't even a load/store, bail out. + if (!PairIsValidLdStrOpc) + return false; + + // FIXME: We don't support merging narrow loads/stores with mixed + // scaled/unscaled offsets. + if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) + return false; + + // Try to match an unscaled load/store with a scaled load/store. + return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && + getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); + + // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? +} + +/// Scan the instructions looking for a load/store that can be combined with the +/// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - LdStPairFlags &Flags, unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit, + bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; + MachineInstr &FirstMI = *I; ++MBBI; - unsigned Opc = FirstMI->getOpcode(); - bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdSt(FirstMI); + bool MayLoad = FirstMI.mayLoad(); + bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); unsigned Reg = getLdStRegOp(FirstMI).getReg(); unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); - bool IsNarrowStore = isNarrowStore(Opc); - - // For narrow stores, find only the case where the stored value is WZR. - if (IsNarrowStore && Reg != AArch64::WZR) - return E; - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1) int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; - if (!(isNarrowLoad(Opc) || IsNarrowStore) && - !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; + bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); // Remember any instructions that read/write memory between FirstMI and MI. SmallVector<MachineInstr *, 4> MemInsns; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions.
Otherwise debug info can affect the // optimization by changing how far we scan. - if (MI->isDebugValue()) + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; - bool CanMergeOpc = Opc == MI->getOpcode(); Flags.setSExtIdx(-1); - if (!CanMergeOpc) { - bool IsValidLdStrOpc; - unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - assert(IsValidLdStrOpc && - "Given Opc should be a Load or Store with an immediate"); - // Opc will be the first instruction in the pair. - Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); - CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); - } - - if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { - assert(MI->mayLoadOrStore() && "Expected memory operation."); + if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + getLdStOffsetOp(MI).isImm()) { + assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); + bool MIIsUnscaled = TII->isUnscaledLdSt(MI); + if (IsUnscaled != MIIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. + // If FirstMI is scaled then scale the offset of MI accordingly. + // Otherwise, do the opposite (i.e., make MI's offset unscaled). + int MemSize = getMemScale(MI); + if (MIIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together: bail and keep looking. + if (MIOffset % MemSize) + continue; + MIOffset /= MemSize; + } else { + MIOffset *= MemSize; + } + } + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdSt(MI); - bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); - if (!IsNarrowLoad && - !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); - continue; - } - - if (IsNarrowLoad || IsNarrowStore) { + if (FindNarrowMerge) { // If the alignment requirements of the scaled wide load/store - // instruction can't express the offset of the scaled narrow - // input, bail and keep looking. - if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + // instruction can't express the offset of the scaled narrow input, + // bail and keep looking. For promotable zero stores, allow only when + // the stored value is the same (i.e., WZR). 
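// Editorial note, not part of the original patch: as a worked example of the
// alignment bail-out below, two scaled STRHHui zero stores at halfword
// offsets 3 and 4 give MinOffset == 3, and alignTo(3, 2) == 4 != 3, so the
// wider word store cannot encode the combined offset and this candidate is
// skipped.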
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || + (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } else { + // Pairwise instructions have a 7-bit signed offset field. Single + // insns have a 12-bit unsigned offset field. If the resultant + // immediate offset of merging these instructions is out of range for + // a pairwise instruction, bail and keep looking. + if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(&MI); + continue; + } // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - // For narrow stores, allow only when the stored value is the same - // (i.e., WZR). - if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || - (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } @@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && - !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && + !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { Flags.setMergeForward(false); return MBBI; @@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // The instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. - if (MI->isCall()) + if (MI.isCall()) return E; // Update modified / uses register lists. @@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // Update list of instructions that read/write memory. - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + if (MI.mayLoadOrStore()) + MemInsns.push_back(&MI); } return E; } @@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; - if (!isPairedLdSt(I)) { + if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction.
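// Editorial note, not part of the original patch: for a paired instruction the
// update amount is rescaled to the pair's element size, e.g. merging a
// post-increment of 32 bytes into an LDPXi (Scale == 8) emits an immediate of
// 32 / 8 == 4.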
- int Scale = getMemScale(I); + int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I, 0)) - .addOperand(getLdStRegOp(I, 1)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value / Scale); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I, 0)) + .addOperand(getLdStRegOp(*I, 1)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value / Scale) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } (void)MIB; @@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, - MachineInstr *MI, +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, + MachineInstr &MI, unsigned BaseReg, int Offset) { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBXri: @@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. - if (!MI->getOperand(2).isImm()) + if (!MI.getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. - if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) break; // The update instruction source and destination register must be the // same as the load/store base register. - if (MI->getOperand(0).getReg() != BaseReg || - MI->getOperand(1).getReg() != BaseReg) + if (MI.getOperand(0).getReg() != BaseReg || + MI.getOperand(1).getReg() != BaseReg) break; bool IsPairedInsn = isPairedLdSt(MemMI); - int UpdateOffset = MI->getOperand(2).getImm(); + int UpdateOffset = MI.getOperand(2).getImm(); // For non-paired load/store instructions, the immediate must fit in a // signed 9-bit integer. if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) @@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) + if (!Offset || Offset == MI.getOperand(2).getImm()) return true; break; } @@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { + MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. 
- if (MI->isDebugValue()) + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr &MI = *MBBI; + // Skip DBG_VALUE instructions. + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; + ModifiedRegs.reset(); + UsedRegs.reset(); + unsigned Count = 0; + do { + --MBBI; + MachineInstr &MI = *MBBI; - // Now that we know this is a real instruction, count it. - ++Count; + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; - } + } while (MBBI != B && Count < Limit); return E; } bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // If this is a volatile load, don't mess with it. - if (MI->hasOrderedMemoryRef()) + if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm. @@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( if (!getLdStOffsetOp(MI).isImm()) return false; - // Look backward up to ScanLimit instructions. + // Look backward up to LdStLimit instructions. MachineBasicBlock::iterator StoreI; - if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + if (findMatchingStore(MBBI, LdStLimit, StoreI)) { ++NumLoadsFromStoresPromoted; // Promote the load. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction @@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( return false; } +// Find narrow loads that can be converted into a single wider load with +// bitfield extract instructions. Also merge adjacent zero stores into a wider +// store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; - MachineBasicBlock::iterator E = MI->getParent()->end(); - // If this is a volatile load/store, don't mess with it. 
- if (MI->hasOrderedMemoryRef()) - return false; + assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) && + "Expected narrow op."); + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!TII->isCandidateToMergeOrPair(MI)) return false; - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) + // For promotable zero stores, the stored value should be WZR. + if (isPromotableZeroStoreOpcode(MI) && + getLdStRegOp(MI).getReg() != AArch64::WZR) return false; - // Look ahead up to ScanLimit instructions for a pairable instruction. + // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; - MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); - if (Paired != E) { + MachineBasicBlock::iterator MergeMI = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); + if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; - } else if (isNarrowStore(MI)) { + } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; - } else { - ++NumPairCreated; - if (isUnscaledLdSt(MI)) - ++NumUnscaledPairCreated; } + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. + MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); + return true; + } + return false; +} - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. +// Find loads and stores that can be merged into a single load or store pair +// instruction. +bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + + if (!TII->isCandidateToMergeOrPair(MI)) + return false; + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + bool IsUnscaled = TII->isUnscaledLdSt(MI); + int Offset = getLdStOffsetOp(MI).getImm(); + int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return false; + + // Look ahead up to LdStLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); + if (Paired != E) { + ++NumPairCreated; + if (TII->isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } @@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst( bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt) { bool Modified = false; - // Three tranformations to do here: + // Four transformations to do here: // 1) Find loads that directly read from stores and promote them by // replacing with mov instructions. If the store is wider than the load, // the load will be replaced with a bitfield extract.
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldrh w2, [x0, #6] // ; becomes // str w1, [x0, #4] - // lsr w2, w1, #16 - // 2) Find narrow loads that can be converted into a single wider load - // with bitfield extract instructions. - // e.g., - // ldrh w0, [x2] - // ldrh w1, [x2, #2] - // ; becomes - // ldr w0, [x2] - // ubfx w1, w0, #16, #16 - // and w0, w0, #ffff - // 3) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 4) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - + // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // + // Also merge adjacent zero stores into a wider store. + // e.g., + // strh wzr, [x0] + // strh wzr, [x0, #2] + // ; becomes + // str wzr, [x0] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::LDRBBui: - case AArch64::LDRHHui: - case AArch64::LDRSBWui: - case AArch64::LDRSHWui: - case AArch64::STRBBui: - case AArch64::STRHHui: - // Unscaled instructions. - case AArch64::LDURBBi: - case AArch64::LDURHHi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - case AArch64::STURBBi: - case AArch64::STURHHi: { + MachineInstr &MI = *MBBI; + unsigned Opc = MI.getOpcode(); + if (isPromotableZeroStoreOpcode(Opc) || + (EnableNarrowLdMerge && isNarrowLoad(Opc))) { if (tryToMergeLdStInst(MBBI)) { Modified = true; - break; - } + } else + ++MBBI; + } else ++MBBI; - break; - } - // FIXME: Do the other instructions. - } } + // 3) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - if (tryToMergeLdStInst(MBBI)) { + if (tryToPairLdStInst(MBBI)) { Modified = true; break; } ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 4) Find base register updates that can be merged into the load or store + // as a base-reg writeback. 
+ // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - unsigned Opc = MI->getOpcode(); + // switches, though not strictly necessary. + unsigned Opc = MI.getOpcode(); switch (Opc) { default: // Just move on to the next instruction. @@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // merged into: // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); @@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdSt(Opc)) { + if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x1, [x0] // merged into: // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); + Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } return Modified; } -bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { - bool ProfitableArch = Subtarget->isCortexA57(); - // FIXME: The benefit from converting narrow loads into a wider load could be - // microarchitectural as it assumes that a single load with two bitfield - // extracts is cheaper than two narrow loads. Currently, this conversion is - // enabled only in cortex-a57 on which performance benefits were verified. - return ProfitableArch && !Subtarget->requiresStrictAlign(); -} - bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(*Fn.getFunction())) + return false; + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + // Resize the modified and used register bitfield trackers. We do this once + // per function and then clear the bitfield each time we optimize a load or + // store. + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + bool Modified = false; - bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); + bool enableNarrowLdOpt = + Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign(); for (auto &MBB : Fn) Modified |= optimizeBlock(MBB, enableNarrowLdOpt); @@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another?
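// Editorial sketch, not part of the original patch: the pairing legality logic
// above reduces to a range and alignment check on the element offset. A
// minimal standalone restatement, assuming the same semantics as
// inBoundsForPair (the helper name pairOffsetInRange is hypothetical):
//
//   static bool pairOffsetInRange(bool IsUnscaled, int Offset, int Stride) {
//     if (IsUnscaled) {
//       if (Offset % Stride)  // byte offset must be a multiple of the stride
//         return false;
//       Offset /= Stride;     // convert byte offset to element offset
//     }
//     return Offset >= -64 && Offset <= 63;  // signed 7-bit ldp/stp field
//   }
//
// e.g., for LDURXi candidates Stride == 8: a byte offset of 24 maps to element
// offset 3 (in range), while a byte offset of 20 is rejected by the modulo
// test.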
+// FIXME: When pairing store instructions it's very possible for this pass to +// hoist a store with a KILL marker above another use (without a KILL marker). +// The resulting IR is invalid, but nothing uses the KILL markers after this +// pass, so it's never caused a problem in practice. + /// createAArch64LoadStoreOptimizationPass - returns an instance of the /// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 318f83953505..49e7767741ea 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -48,6 +48,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; + /// \brief Amount of stack frame size used for saving callee-saved registers. + unsigned CalleeSavedStackSize; + /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. unsigned NumLocalDynamicTLSAccesses; @@ -76,18 +79,28 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// copies. bool IsSplitCSR; + /// True when the stack gets realigned dynamically because the size of stack + /// frame is unknown at compile time. e.g., in case of VLAs. + bool StackRealigned; + + /// True when the callee-save stack area has unused gaps that may be used for + /// other stack allocations. + bool CalleeSaveStackHasFreeSpace; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) {} + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) { + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) { (void)MF; } @@ -102,12 +115,25 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isStackRealigned() const { return StackRealigned; } + void setStackRealigned(bool s) { StackRealigned = s; } + + bool hasCalleeSaveStackFreeSpace() const { + return CalleeSaveStackHasFreeSpace; + } + void setCalleeSaveStackHasFreeSpace(bool s) { + CalleeSaveStackHasFreeSpace = s; + } + bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } + void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamicTLSAccesses; @@ -140,15 +166,15 @@ public: SmallVector<const MachineInstr *, 3> Args; public: - typedef SmallVectorImpl<const MachineInstr *> LOHArgs; - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + typedef ArrayRef<const MachineInstr *> LOHArgs; + MILOHDirective(MCLOHType Kind, LOHArgs Args) : Kind(Kind), Args(Args.begin(), Args.end()) { assert(isValidMCLOHType(Kind) &&
"Invalid LOH directive type!"); } MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } + LOHArgs getArgs() const { return Args; } }; typedef MILOHDirective::LOHArgs MILOHArgs; @@ -157,7 +183,7 @@ public: const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } /// Add a LOH directive of this @p Kind and this @p Args. - void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + void addLOHDirective(MCLOHType Kind, MILOHArgs Args) { LOHContainerSet.push_back(MILOHDirective(Kind, Args)); LOHRelated.insert(Args.begin(), Args.end()); } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 5394875a6bc1..038162c6f54a 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -320,7 +320,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, const MachineInstr &MI) { const LiveInterval &LI = LIs.getInterval(reg); - SlotIndex SI = LIs.getInstructionIndex(&MI); + SlotIndex SI = LIs.getInstructionIndex(MI); return LI.expiredAt(SI); } diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 79c09d9f058d..b1e40510b2ae 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -85,6 +85,21 @@ namespace { class AArch64PromoteConstant : public ModulePass { public: + struct PromotedConstant { + bool ShouldConvert = false; + GlobalVariable *GV = nullptr; + }; + typedef SmallDenseMap PromotionCacheTy; + + struct UpdateRecord { + Constant *C; + Instruction *User; + unsigned Op; + + UpdateRecord(Constant *C, Instruction *User, unsigned Op) + : C(C), User(User), Op(Op) {} + }; + static char ID; AArch64PromoteConstant() : ModulePass(ID) {} @@ -94,9 +109,12 @@ public: /// global variables with module scope. bool runOnModule(Module &M) override { DEBUG(dbgs() << getPassName() << '\n'); + if (skipModule(M)) + return false; bool Changed = false; + PromotionCacheTy PromotionCache; for (auto &MF : M) { - Changed |= runOnFunction(MF); + Changed |= runOnFunction(MF, PromotionCache); } return Changed; } @@ -105,7 +123,7 @@ private: /// Look for interesting constants used within the given function. /// Promote them into global variables, load these global variables within /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); + bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache); // This transformation requires dominator info void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -115,79 +133,72 @@ private: } /// Type to store a list of Uses. - typedef SmallVector Uses; + typedef SmallVector, 4> Uses; /// Map an insertion point to all the uses it dominates. typedef DenseMap InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable. - typedef DenseMap InsertionPointsPerFunc; /// Find the closest point that dominates the given Use. - Instruction *findInsertionPoint(Use &Use); + Instruction *findInsertionPoint(Instruction &User, unsigned OpNo); /// Check if the given insertion point is dominated by an existing /// insertion point. /// If true, the given use is added to the list of dominated uses for /// the related existing point. 
/// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instructions in InsertPts belong to the same function /// \return true if one of the insertion points in InsertPts dominates NewPt, /// false otherwise - bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Check if the given insertion point can be merged with an existing /// insertion point in a common dominator. /// If true, the given use is added to the list of the created insertion /// point. /// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instructions in InsertPts belong to the same function /// \pre isDominated returns false for the exact same parameters. /// \return true if there exists an insertion point in InsertPts that could /// have been merged with NewPt in a common dominator, /// false otherwise - bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Compute the minimal insertion points to dominate all the interesting /// uses of value. /// Insertion points are grouped per function and each insertion point /// contains a list of all the uses it dominates within the related function - /// \param Val constant to be examined - /// \param[out] InsPtsPerFunc output storage of the analysis - void computeInsertionPoints(Constant *Val, - InsertionPointsPerFunc &InsPtsPerFunc); + /// \param User the user of the constant + /// \param OpNo the operand number of the constant + /// \param[out] InsertPts output storage of the analysis + void computeInsertionPoint(Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Insert a definition of a new global variable at each point contained in /// InsPtsPerFunc and update the related uses (also contained in /// InsPtsPerFunc). - bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); - - /// Compute the minimal insertion points to dominate all the interesting - /// uses of Val and insert a definition of a new global variable - /// at these points. - /// Also update the uses of Val accordingly. - /// Currently a use of Val is considered interesting if: - /// - Val is not UndefValue - /// - Val is not zeroinitialized - /// - Replacing Val per a load of a global variable is valid. - /// \see shouldConvert for more details - bool computeAndInsertDefinitions(Constant *Val); - - /// Promote the given constant into a global variable if it is expected to - /// be profitable. - /// \return true if Cst has been promoted - bool promoteConstant(Constant *Cst); + void insertDefinitions(Function &F, GlobalVariable &GV, + InsertionPoints &InsertPts); + + /// Do the constant promotion indicated by the Updates records, keeping track + /// of globals in PromotionCache. + void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates, + PromotionCacheTy &PromotionCache); /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
/// Append Use to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use, + static void appendAndTransferDominatedUses(Instruction *NewPt, + Instruction *User, unsigned OpNo, InsertionPoints::iterator &IPI, InsertionPoints &InsertPts) { // Record the dominated use. - IPI->second.push_back(&Use); + IPI->second.emplace_back(User, OpNo); // Transfer the dominated uses of IPI to NewPt // Inserting into the DenseMap may invalidate existing iterator. // Keep a copy of the key to find the iterator to erase. Keep a copy of the @@ -285,10 +296,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, // Do not mess with inline asm. const CallInst *CI = dyn_cast<const CallInst>(Instr); - if (CI && isa<const InlineAsm>(CI->getCalledValue())) - return false; - - return true; + return !(CI && isa<const InlineAsm>(CI->getCalledValue())); } /// Check if the given Cst should be converted into @@ -305,7 +313,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, /// for the regular approach, even for float). /// Again, the simplest solution would be to promote every /// constant and rematerialize them when they are actually cheap to create. -static bool shouldConvert(const Constant *Cst) { +static bool shouldConvertImpl(const Constant *Cst) { if (isa<const UndefValue>(Cst)) return false; @@ -328,18 +336,28 @@ return isConstantUsingVectorTy(Cst->getType()); } -Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) { - Instruction *User = cast<Instruction>(Use.getUser()); +static bool +shouldConvert(Constant &C, + AArch64PromoteConstant::PromotionCacheTy &PromotionCache) { + auto Converted = PromotionCache.insert( + std::make_pair(&C, AArch64PromoteConstant::PromotedConstant())); + if (Converted.second) + Converted.first->second.ShouldConvert = shouldConvertImpl(&C); + return Converted.first->second.ShouldConvert; +} +Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User, + unsigned OpNo) { // If this user is a phi, the insertion point is in the related // incoming basic block.
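// Editorial note, not part of the original patch: for example, given
//   %v = phi <4 x i16> [ %cst, %bb1 ], [ %other, %bb2 ]
// with OpNo naming the %bb1 incoming value, the load of the promoted global
// must be emitted at the terminator of %bb1; placing it directly before the
// phi would be invalid IR.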
- if (PHINode *PhiInst = dyn_cast<PHINode>(User)) - return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); + if (PHINode *PhiInst = dyn_cast<PHINode>(&User)) + return PhiInst->getIncomingBlock(OpNo)->getTerminator(); - return User; + return &User; } -bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( @@ -358,14 +376,15 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Insertion point dominated by:\n"); DEBUG(IPI.first->print(dbgs())); DEBUG(dbgs() << '\n'); - IPI.second.push_back(&Use); + IPI.second.emplace_back(User, OpNo); return true; } } return false; } -bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); @@ -385,7 +404,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Merge insertion point with:\n"); DEBUG(IPI->first->print(dbgs())); DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } @@ -409,149 +428,141 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << '\n'); DEBUG(NewPt->print(dbgs())); DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } return false; } -void AArch64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Use &Use : Val->uses()) { - Instruction *User = dyn_cast<Instruction>(Use.getUser()); - - // If the user is not an Instruction, we cannot modify it. - if (!User) - continue; - - // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, User, Use.getOperandNo())) - continue; +void AArch64PromoteConstant::computeInsertionPoint( + Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) { + DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n"); + DEBUG(User->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n"); - DEBUG(User->print(dbgs())); - DEBUG(dbgs() << '\n'); + Instruction *InsertionPoint = findInsertionPoint(*User, OpNo); - Instruction *InsertionPoint = findInsertionPoint(Use); + DEBUG(dbgs() << "Considered insertion point:\n"); + DEBUG(InsertionPoint->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); + if (isDominated(InsertionPoint, User, OpNo, InsertPts)) + return; + // This insertion point is useful, check if we can merge some insertion + // point in a common dominator or if NewPt dominates an existing one. + if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts)) + return; - // Check if the current insertion point is useless, i.e., it is dominated - // by another one.
- InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, Use, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, Use, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); + DEBUG(dbgs() << "Keep considered insertion point\n"); - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(&Use); - } + // It is definitely useful by its own + InsertPts[InsertionPoint].emplace_back(User, OpNo); } -bool AArch64PromoteConstant::insertDefinitions( - Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module. - DenseMap<Module *, GlobalVariable *> ModuleToMergedGV; - bool HasChanged = false; +static void ensurePromotedGV(Function &F, Constant &C, + AArch64PromoteConstant::PromotedConstant &PC) { + assert(PC.ShouldConvert && + "Expected that we should convert this to a global"); + if (PC.GV) + return; + PC.GV = new GlobalVariable( + *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr, + "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); + PC.GV->setInitializer(&C); + DEBUG(dbgs() << "Global replacement: "); + DEBUG(PC.GV->print(dbgs())); + DEBUG(dbgs() << '\n'); + ++NumPromoted; +} - // Traverse all insertion points in all the function. - for (const auto &FctToInstPtsIt : InsPtsPerFunc) { - const InsertionPoints &InsertPts = FctToInstPtsIt.second; -// Do more checking for debug purposes. +void AArch64PromoteConstant::insertDefinitions(Function &F, + GlobalVariable &PromotedGV, + InsertionPoints &InsertPts) { #ifndef NDEBUG - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( - *FctToInstPtsIt.first).getDomTree(); + // Do more checking for debug purposes. + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); #endif - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt.first->getParent(); - GlobalVariable *&PromotedGV = ModuleToMergedGV[M]; - if (!PromotedGV) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, - "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } - - for (const auto &IPI : InsertPts) { - // Create the load of the global variable. - IRBuilder<> Builder(IPI.first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); + assert(!InsertPts.empty() && "Empty uses does not need a definition"); + + for (const auto &IPI : InsertPts) { + // Create the load of the global variable. + IRBuilder<> Builder(IPI.first); + LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV); + DEBUG(dbgs() << "**********\n"); + DEBUG(dbgs() << "New def: "); + DEBUG(LoadedCst->print(dbgs())); + DEBUG(dbgs() << '\n'); - // Update the dominated uses. - for (Use *Use : IPI.second) { + // Update the dominated uses.
+    for (auto Use : IPI.second) {
 #ifndef NDEBUG
-      assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) &&
-             "Inserted definition does not dominate all its uses!");
+      assert(DT.dominates(LoadedCst,
+                          findInsertionPoint(*Use.first, Use.second)) &&
+             "Inserted definition does not dominate all its uses!");
 #endif
-      DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":");
-      DEBUG(Use->getUser()->print(dbgs()));
-      DEBUG(dbgs() << '\n');
-      Use->set(LoadedCst);
-      ++NumPromotedUses;
-    }
+      DEBUG({
+        dbgs() << "Use to update " << Use.second << ":";
+        Use.first->print(dbgs());
+        dbgs() << '\n';
+      });
+      Use.first->setOperand(Use.second, LoadedCst);
+      ++NumPromotedUses;
     }
   }
-  return HasChanged;
 }
 
-bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
-  InsertionPointsPerFunc InsertPtsPerFunc;
-  computeInsertionPoints(Val, InsertPtsPerFunc);
-  return insertDefinitions(Val, InsertPtsPerFunc);
-}
-
-bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
-  assert(Cst && "Given variable is not a valid constant.");
-
-  if (!shouldConvert(Cst))
-    return false;
-
-  DEBUG(dbgs() << "******************************\n");
-  DEBUG(dbgs() << "Candidate constant: ");
-  DEBUG(Cst->print(dbgs()));
-  DEBUG(dbgs() << '\n');
-
-  return computeAndInsertDefinitions(Cst);
+void AArch64PromoteConstant::promoteConstants(
+    Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+    PromotionCacheTy &PromotionCache) {
+  // Promote the constants.
+  for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
+    DEBUG(dbgs() << "** Compute insertion points **\n");
+    auto First = U;
+    Constant *C = First->C;
+    InsertionPoints InsertPts;
+    do {
+      computeInsertionPoint(U->User, U->Op, InsertPts);
+    } while (++U != E && U->C == C);
+
+    auto &Promotion = PromotionCache[C];
+    ensurePromotedGV(F, *C, Promotion);
+    insertDefinitions(F, *Promotion.GV, InsertPts);
+  }
 }
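promoteConstants above consumes Updates in runs that share one Constant, which only works because runOnFunction (next hunk) appends all records for a given constant contiguously. A minimal standalone sketch of that run-grouping idiom, using hypothetical stand-in types rather than the LLVM ones:

    #include <cstdio>
    #include <vector>

    // Stand-ins for Constant*/Instruction*; illustration only.
    struct UpdateRec { int ConstId; int UserId; unsigned OpNo; };

    // Consume the records in runs keyed by ConstId, mirroring the
    // do/while loop in promoteConstants.
    static void processRuns(const std::vector<UpdateRec> &Updates) {
      for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
        int C = U->ConstId; // key for the current run
        std::printf("constant %d:\n", C);
        do { // visit every record of the run exactly once
          std::printf("  user %d, operand %u\n", U->UserId, U->OpNo);
        } while (++U != E && U->ConstId == C);
      }
    }

    int main() { processRuns({{1, 10, 0}, {1, 11, 2}, {2, 12, 1}}); }

The payoff of the grouping is that the global variable and its insertion points are computed once per constant, no matter how many uses it has.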
 
-bool AArch64PromoteConstant::runOnFunction(Function &F) {
+bool AArch64PromoteConstant::runOnFunction(Function &F,
+                                           PromotionCacheTy &PromotionCache) {
   // Look for instructions using constant vector. Promote that constant to a
   // global variable. Create as few loads of this variable as possible and
   // update the uses accordingly.
-  bool LocalChange = false;
-  SmallPtrSet<Constant *, 8> AlreadyChecked;
-
+  SmallVector<UpdateRecord, 64> Updates;
   for (Instruction &I : instructions(&F)) {
     // Traverse the operand, looking for constant vectors. Replace them by a
     // load of a global variable of constant vector type.
-    for (Value *Op : I.operand_values()) {
-      Constant *Cst = dyn_cast<Constant>(Op);
+    for (Use &U : I.operands()) {
+      Constant *Cst = dyn_cast<Constant>(U);
       // There is no point in promoting global values as they are already
       // global. Do not promote constant expressions either, as they may
       // require some code expansion.
-      if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
-          AlreadyChecked.insert(Cst).second)
-        LocalChange |= promoteConstant(Cst);
+      if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+        continue;
+
+      // Check if this constant is worth promoting.
+      if (!shouldConvert(*Cst, PromotionCache))
+        continue;
+
+      // Check if this use should be promoted.
+      unsigned OpNo = &U - I.op_begin();
+      if (!shouldConvertUse(Cst, &I, OpNo))
+        continue;
+
+      Updates.emplace_back(Cst, &I, OpNo);
     }
   }
-  return LocalChange;
+
+  if (Updates.empty())
+    return false;
+
+  promoteConstants(F, Updates, PromotionCache);
+  return true;
 }
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
new file mode 100644
index 000000000000..60d8bbd260bb
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -0,0 +1,182 @@
+//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass removes unnecessary zero copies in BBs that are targets of
+// cbz/cbnz instructions. For instance, the copy instruction in the code below
+// can be removed because the CBZW jumps to BB#2 when W0 is zero.
+//  BB#1:
+//    CBZW %W0, <BB#2>
+//  BB#2:
+//    %W0 = COPY %WZR
+// This pass should be run after register allocation.
+//
+// FIXME: This should be extended to handle any constant other than zero. E.g.,
+//   cmp w0, #1
+//   b.eq .BB1
+// BB1:
+//   mov w0, #1
+//
+// FIXME: This could also be extended to check the whole dominance subtree below
+// the comparison if the compile time regression is acceptable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-copyelim"
+
+STATISTIC(NumCopiesRemoved, "Number of copies removed.");
+
+namespace llvm {
+void initializeAArch64RedundantCopyEliminationPass(PassRegistry &);
+}
+
+namespace {
+class AArch64RedundantCopyElimination : public MachineFunctionPass {
+  const MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {}
+  bool optimizeCopy(MachineBasicBlock *MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::AllVRegsAllocated);
+  }
+  const char *getPassName() const override {
+    return "AArch64 Redundant Copy Elimination";
+  }
+};
+char AArch64RedundantCopyElimination::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
+                "AArch64 redundant copy elimination pass", false, false)
+
+static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
+  unsigned Opc = MI.getOpcode();
+  // Check if the current basic block is the target block to which the
+  // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
+  if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+      MBB == MI.getOperand(1).getMBB())
+    return true;
+  else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+           MBB != MI.getOperand(1).getMBB())
+    return true;
+
+  return false;
+}
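The core rewrite is easy to state outside of MachineIR: once control can only reach a block via the zero edge of a CBZ/CBNZ on some register, that register is known to be zero on entry, and any leading COPY from WZR/XZR into it is dead until the register is redefined. A rough standalone sketch of that forward scan (toy instruction type with hypothetical field names; optimizeCopy below implements the real thing, including sub/super-register aliasing and kill-flag fixup):

    #include <cstdio>
    #include <vector>

    // Toy instruction: IsZeroCopy marks "DefReg = COPY WZR/XZR";
    // DefReg < 0 means the instruction defines no register.
    struct ToyInst { bool IsZeroCopy; int DefReg; };

    // Erase zero-copies into KnownZeroReg until the register is redefined,
    // mirroring the forward scan in optimizeCopy.
    static unsigned eraseRedundantZeroCopies(std::vector<ToyInst> &Block,
                                             int KnownZeroReg) {
      unsigned Removed = 0;
      for (auto I = Block.begin(); I != Block.end();) {
        if (I->IsZeroCopy && I->DefReg == KnownZeroReg) {
          I = Block.erase(I); // the value is already zero on entry
          ++Removed;
          continue;
        }
        if (I->DefReg == KnownZeroReg)
          break; // redefined: the known-zero fact no longer holds
        ++I;
      }
      return Removed;
    }

    int main() {
      std::vector<ToyInst> BB = {{true, 0}, {false, -1}, {true, 0}, {false, 0}};
      std::printf("removed %u copies\n", eraseRedundantZeroCopies(BB, 0)); // 2
    }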
+bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
+  // Check if the current basic block has a single predecessor.
+  if (MBB->pred_size() != 1)
+    return false;
+
+  MachineBasicBlock *PredMBB = *MBB->pred_begin();
+  MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
+  if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+    return false;
+
+  ++CompBr;
+  do {
+    --CompBr;
+    if (guaranteesZeroRegInBlock(*CompBr, MBB))
+      break;
+  } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+
+  // We've not found a CBZ/CBNZ, time to bail out.
+  if (!guaranteesZeroRegInBlock(*CompBr, MBB))
+    return false;
+
+  unsigned TargetReg = CompBr->getOperand(0).getReg();
+  if (!TargetReg)
+    return false;
+  assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
+         "Expect physical register");
+
+  // Remember all registers aliasing with TargetReg.
+  SmallSetVector<unsigned, 8> TargetRegs;
+  for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
+    TargetRegs.insert(*AI);
+
+  bool Changed = false;
+  MachineBasicBlock::iterator LastChange = MBB->begin();
+  unsigned SmallestDef = TargetReg;
+  // Remove redundant Copy instructions unless TargetReg is modified.
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr *MI = &*I;
+    ++I;
+    if (MI->isCopy() && MI->getOperand(0).isReg() &&
+        MI->getOperand(1).isReg()) {
+
+      unsigned DefReg = MI->getOperand(0).getReg();
+      unsigned SrcReg = MI->getOperand(1).getReg();
+
+      if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
+          !MRI->isReserved(DefReg) &&
+          (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
+        DEBUG(dbgs() << "Remove redundant Copy : ");
+        DEBUG((MI)->print(dbgs()));
+
+        MI->eraseFromParent();
+        Changed = true;
+        LastChange = I;
+        NumCopiesRemoved++;
+        SmallestDef =
+            TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
+        continue;
+      }
+    }
+
+    if (MI->modifiesRegister(TargetReg, TRI))
+      break;
+  }
+
+  if (!Changed)
+    return false;
+
+  // Otherwise, we have to fix up the use-def chain, starting with the
+  // CBZ/CBNZ. Conservatively mark as much as we can as live.
+  CompBr->clearRegisterKills(SmallestDef, TRI);
+
+  if (std::none_of(TargetRegs.begin(), TargetRegs.end(),
+                   [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
+    MBB->addLiveIn(TargetReg);
+
+  // Clear any kills of TargetReg between CompBr and the last removed COPY.
+  for (MachineInstr &MMI :
+       make_range(MBB->begin()->getIterator(), LastChange->getIterator()))
+    MMI.clearRegisterKills(SmallestDef, TRI);
+
+  return true;
+}
+
+bool AArch64RedundantCopyElimination::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeCopy(&MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() {
+  return new AArch64RedundantCopyElimination();
+}
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
new file mode 100644
index 000000000000..0a1831bd9a8c
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -0,0 +1,168 @@
+//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+    : RegisterBankInfo(AArch64::NumRegisterBanks) {
+  // Initialize the GPR bank.
+  createRegisterBank(AArch64::GPRRegBankID, "GPR");
+  // The GPR register bank is fully defined by all the registers in
+  // GPR64all + its subclasses.
+  addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+  const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+  (void)RBGPR;
+  assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+  // Initialize the FPR bank.
+  createRegisterBank(AArch64::FPRRegBankID, "FPR");
+  // The FPR register bank is fully defined by all the registers in
+  // QQQQ + its subclasses.
+  addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
+  const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+  (void)RBFPR;
+  assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+         "Subclass not added?");
+  assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+         "Subclass not added?");
+  assert(RBFPR.getSize() == 512 &&
+         "FPRs should hold up to 512-bit via QQQQ sequence");
+
+  // Initialize the CCR bank.
+  createRegisterBank(AArch64::CCRRegBankID, "CCR");
+  addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
+  const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+  (void)RBCCR;
+  assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+         "Class not added?");
+  assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+  assert(verify(TRI) && "Invalid register bank information");
+}
+
+unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
+                                           const RegisterBank &B,
+                                           unsigned Size) const {
+  // What do we do with different sizes? For now a copy is assumed to be
+  // between banks of the same size.
+  // Other hooks will be introduced for different sizes:
+  // * extract cost.
+  // * build_sequence cost.
+  // TODO: Add more accurate cost for FPR to/from GPR.
+  return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+  switch (RC.getID()) {
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+  case AArch64::FPR128_loRegClassID:
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return getRegBank(AArch64::FPRRegBankID);
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32sponlyRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64commonRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64sponlyRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::tcGPR64RegClassID:
+  case AArch64::WSeqPairsClassRegClassID:
+  case AArch64::XSeqPairsClassRegClassID:
+    return getRegBank(AArch64::GPRRegBankID);
+  case AArch64::CCRRegClassID:
+    return getRegBank(AArch64::CCRRegBankID);
+  default:
+    llvm_unreachable("Register class not supported");
+  }
+}
+
+RegisterBankInfo::InstructionMappings
+AArch64RegisterBankInfo::getInstrAlternativeMappings(
+    const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_OR: {
+    // 32- and 64-bit OR can be mapped to either FPR or
+    // GPR for the same cost.
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const TargetSubtargetInfo &STI = MF.getSubtarget();
+    const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 3)
+      break;
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(/*ID*/ 1, /*Cost*/ 1, /*NumOperands*/ 3);
+    InstructionMapping FPRMapping(/*ID*/ 2, /*Cost*/ 1, /*NumOperands*/ 3);
+    for (unsigned Idx = 0; Idx != 3; ++Idx) {
+      GPRMapping.setOperandMapping(Idx, Size,
+                                   getRegBank(AArch64::GPRRegBankID));
+      FPRMapping.setOperandMapping(Idx, Size,
+                                   getRegBank(AArch64::FPRRegBankID));
+    }
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AArch64RegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  switch (OpdMapper.getMI().getOpcode()) {
+  case TargetOpcode::G_OR: {
+    // Those IDs must match getInstrAlternativeMappings.
+    assert((OpdMapper.getInstrMapping().getID() == 1 ||
+            OpdMapper.getInstrMapping().getID() == 2) &&
+           "Don't know how to handle that ID");
+    return applyDefaultMapping(OpdMapper);
+  }
+  default:
+    llvm_unreachable("Don't know how to handle that operation");
+  }
+}
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
new file mode 100644
index 000000000000..907bcfdea161
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -0,0 +1,69 @@
+//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace AArch64 {
+enum {
+  GPRRegBankID = 0, /// General Purpose Registers: W, X.
+  FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
+  CCRRegBankID = 2, /// Conditional register: NZCV.
+  NumRegisterBanks
+};
+} // End AArch64 namespace.
+
+/// This class provides the information for the target register banks.
+class AArch64RegisterBankInfo : public RegisterBankInfo {
+  /// See RegisterBankInfo::applyMapping.
+  void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+public:
+  AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
+  /// Get the cost of a copy from \p B to \p A, or put differently,
+  /// get the cost of A = COPY B. Since register banks may cover
+  /// different sizes, \p Size specifies what will be the size in bits
+  /// that will be copied around.
+  ///
+  /// \note Since this is a copy, both registers have the same size.
+  unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+                    unsigned Size) const override;
+
+  /// Get a register bank that covers \p RC.
+  ///
+  /// \pre \p RC is a user-defined register class (as opposed to one
+  /// generated by TableGen).
+  ///
+  /// \note The mapping RC -> RegBank could be built while adding the
+  /// coverage for the register banks. However, we do not do it, because,
+  /// at least for now, we only need this information for register classes
+  /// that are used in the description of instructions. In other words,
+  /// there are just a handful of them and we do not want to waste space.
+  ///
+  /// \todo This should be TableGen'ed.
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  /// Get the alternative mappings for \p MI.
+  /// Alternative in the sense different from getInstrMapping.
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
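As the \note above says, getRegBankFromRegClass is a plain static table over a handful of register classes. A toy version of that shape (made-up class and bank IDs, not the TableGen-generated AArch64 enums) is sketched here, essentially what the \todo suggests should eventually be emitted automatically:

    #include <cstdio>

    // Made-up IDs standing in for the generated *RegClassID enums.
    enum Bank { GPRBank, FPRBank, CCRBank };
    enum ClassId { GPR32, GPR64, FPR64, FPR128, QQQQ, NZCV };

    // Every register class maps statically to the one bank that covers it.
    static Bank bankForClass(ClassId RC) {
      switch (RC) {
      case GPR32:
      case GPR64:
        return GPRBank; // integer classes live in the GPR bank
      case FPR64:
      case FPR128:
      case QQQQ:
        return FPRBank; // FP/vector classes live in the FPR bank
      case NZCV:
        return CCRBank; // the flags class lives in the CCR bank
      }
      return GPRBank; // unreachable for valid class IDs
    }

    int main() { std::printf("QQQQ -> bank %d\n", bankForClass(QQQQ)); }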
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 32b4888f2f64..af867da4823d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -25,7 +25,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
@@ -51,6 +50,13 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
         CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
         CSR_AArch64_CXX_TLS_Darwin_SaveList;
+  if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      MF->getFunction()->getAttributes().hasAttrSomewhere(
+          Attribute::SwiftError))
+    return CSR_AArch64_AAPCS_SwiftError_SaveList;
+  if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost)
+    return CSR_AArch64_RT_MostRegs_SaveList;
   else
     return CSR_AArch64_AAPCS_SaveList;
 }
@@ -74,6 +80,12 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return CSR_AArch64_AllRegs_RegMask;
   if (CC == CallingConv::CXX_FAST_TLS)
     return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+  if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_AArch64_AAPCS_SwiftError_RegMask;
+  if (CC == CallingConv::PreserveMost)
+    return CSR_AArch64_RT_MostRegs_RegMask;
   else
     return CSR_AArch64_AAPCS_RegMask;
 }
@@ -190,9 +202,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
     // If it's wrong, we'll materialize the constant and still get to the
     // object; it's just suboptimal. Negative offsets use the unscaled
     // load/store instructions, which have a 9-bit signed immediate.
-    if (MFI->getLocalFrameSize() < 256)
-      return false;
-    return true;
+    return MFI->getLocalFrameSize() >= 256;
   }
 
   return false;
@@ -231,9 +241,7 @@ bool AArch64RegisterInfo::requiresFrameIndexScavenging(
 bool
 AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  // Only consider eliminating leaf frames.
-  if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
-                          MFI->adjustsStack()))
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
     return true;
   return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
 }
@@ -396,8 +404,6 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
 }
 
-namespace llvm {
-
 unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                                   MachineFunction &MF) const {
   const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -437,5 +443,3 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
     return 16;
   }
 }
-
-} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a8c8b176efa9..5fbaff00a5e7 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index d709bee7b9eb..93ca079275c8 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -19,13 +19,13 @@ def CortexA53Model : SchedMachineModel {
   let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
   let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
-  let MinLatency = 1 ;       // OperandCycles are interpreted as MinLatency.
   let LoadLatency = 3;       // Optimistic load latency assuming bypass.
                              // This is overridden by OperandCycles if the
                              // Itineraries are queried instead.
   let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
                              // Specification - Instruction Timings"
                              // v 1.0 Spreadsheet
+  let CompleteModel = 1;
 }
 
@@ -109,6 +109,8 @@ def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
 def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
                                                   let ResourceCycles = [3]; }
 
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
 // Branch
 def : WriteRes<WriteBr, [A53UnitB]>;
 def : WriteRes<WriteBrReg, [A53UnitB]>;
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index ca4457af8525..a266351f7ffc 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -30,6 +30,7 @@ def CortexA57Model : SchedMachineModel {
 
   // Enable partial & runtime unrolling. The magic number is chosen based on
   // experiments and benchmarking data.
   let LoopMicroOpBufferSize = 16;
+  let CompleteModel = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -96,6 +97,8 @@ def : SchedAlias;
 def : SchedAlias;
 def : SchedAlias;
 
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
 def : WriteRes<WriteSys,     []> { let Latency = 1; }
 def : WriteRes<WriteBarrier, []> { let Latency = 1; }
 def : WriteRes<WriteHint,    []> { let Latency = 1; }
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index a2a180237789..9fd3ae6818e5 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -1,4 +1,4 @@
-//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,6 +17,7 @@ def CycloneModel : SchedMachineModel {
   let MicroOpBufferSize = 192; // Based on the reorder buffer.
   let LoadLatency = 4;         // Optimistic load latency.
   let MispredictPenalty = 16;  // 14-19 cycles are typical.
+  let CompleteModel = 1;
 }
 
//===----------------------------------------------------------------------===//
@@ -107,7 +108,7 @@ def WriteX : SchedWriteRes<[]> { let Latency = 0; }
 // The move is replaced by a single nop micro-op.
 // MOVZ Rd, #0
 // AND Rd, Rzr, #imm
-def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
 def WriteImmZ  : SchedWriteVariant<[
                    SchedVar<WriteZPred, [WriteX]>,
                    SchedVar<NoSchedPred, [WriteImm]>]>;
@@ -116,8 +117,8 @@ def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
 // Move GPR is a register rename and single nop micro-op.
 // ORR Xd, XZR, Xm
 // ADD Xd, Xn, #0
-def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
-def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
 def WriteMov      : SchedWriteVariant<[
                       SchedVar<WriteIMovPred, [WriteX]>,
                       SchedVar<WriteVMovPred, [WriteX]>,
@@ -726,7 +727,7 @@ def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
              (instrs LD3Rv1d,LD3Rv2d)>;
 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
-             (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
+             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
 
 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
              (instregex "LD4Fourv(8b|4h|2s)$")>;
@@ -851,6 +852,9 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 
+// Atomic operations are not supported.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
 //---
 // Unused SchedRead types
 //---
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
new file mode 100644
index 000000000000..4e491a04c78d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -0,0 +1,133 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. Now, the latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue.
+
+def KryoModel : SchedMachineModel {
+  let IssueWidth = 5;          // 5-wide issue for expanded uops
+  let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+  let LoadLatency = 4;         // Optimistic load latency
+  let MispredictPenalty = 14;  // Fetch + Decode/Rename/Dispatch + Branch
+
+  // Enable partial & runtime unrolling. The magic number is chosen based on
+  // experiments and benchmarking data.
+  let LoopMicroOpBufferSize = 16;
+  let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+  def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops
+  def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops
+  def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops
+  def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops
+  def KryoUnitX  : ProcResGroup<[KryoUnitXA,  // Type X micro-ops
+                                 KryoUnitXB]>;
+  def KryoUnitY  : ProcResGroup<[KryoUnitYA,  // Type Y micro-ops
+                                 KryoUnitYB]>;
+  def KryoUnitXY : ProcResGroup<[KryoUnitXA,  // Type XY micro-ops
+                                 KryoUnitXB,
+                                 KryoUnitYA,
+                                 KryoUnitYB]>;
+  def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops
+  def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops
+  def KryoUnitLS  : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops
+                                  KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes
+  { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes
+  { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes
+  { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes
+  { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes
+  { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes { let Latency = 5; }
+def : WriteRes { let Latency = 5; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 6; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes
+  { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 6; }
+def : WriteRes { let Latency = 6; }
+def : WriteRes
+  { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes
+  { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes { let Latency = 6; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
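The ReadAdvance entries that follow all declare an advance of zero. As a sketch of what they control (my reading of the generic machine model, not Kryo-specific data): a consumer that reads an operand N cycles after issue effectively sees the producer's write latency shortened by N, clamped at zero, so a zero advance means latencies are used exactly as written above.

    #include <algorithm>
    #include <cstdio>

    // Sketch of the generic ReadAdvance rule: the operand latency seen by a
    // consumer is the producer's write latency minus the read's advance.
    static int effectiveLatency(int WriteLatency, int ReadAdvance) {
      return std::max(0, WriteLatency - ReadAdvance);
    }

    int main() {
      std::printf("%d\n", effectiveLatency(4, 0)); // 4: no forwarding modelled
      std::printf("%d\n", effectiveLatency(4, 2)); // 2: what an advance would buy
    }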
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
new file mode 100644
index 000000000000..426ae6103e4b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -0,0 +1,2358 @@
+//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Kryo subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+def KryoWrite_3cyc_X_noRSV_138ln :
+  SchedWriteRes<[KryoUnitX]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln],
+  (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>;
+
+def KryoWrite_3cyc_X_X_139ln :
+  SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_139ln],
+  (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>;
+
+def KryoWrite_4cyc_XY_XY_noRSV_172ln :
+  SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln],
+  (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_XY_XY_XY_XY_178ln :
+  SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+  let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln],
+  (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_XY_XY_XY_XY_177ln :
+  SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+  let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln],
+  (instregex "(S|U)ABALv.*")>;
+def KryoWrite_3cyc_XY_XY_166ln :
+  SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_166ln],
+  (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_159ln :
+  SchedWriteRes<[KryoUnitXY]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln],
+  (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_165ln :
+  SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_165ln],
+  (instregex "(S|U)ABDLv.*")>;
+def KryoWrite_3cyc_X_noRSV_154ln :
+  SchedWriteRes<[KryoUnitX]> {
+  let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln],
+  (instregex 
"(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>; +def KryoWrite_3cyc_X_X_155ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_155ln], + (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>; +def KryoWrite_2cyc_XY_XY_151ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_151ln], + (instregex "(S|U)(ADD|SUB)Lv.*")>; +def KryoWrite_2cyc_XY_noRSV_148ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln], + (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>; +def KryoWrite_2cyc_XY_XY_150ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_150ln], + (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>; +def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln], + (instrs SADDLVv4i32v, UADDLVv4i32v)>; +def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln], + (instrs SADDLVv8i16v, UADDLVv8i16v)>; +def KryoWrite_6cyc_XY_XY_X_noRSV_181ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln], + (instrs SADDLVv16i8v, UADDLVv16i8v)>; +def KryoWrite_3cyc_XY_noRSV_158ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln], + (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>; +def KryoWrite_4cyc_X_noRSV_169ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln], + (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>; +def KryoWrite_2cyc_XY_XY_XY_XY_176ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln], + (instregex "(S|U)(ADDW|SUBW)v.*")>; +def KryoWrite_4cyc_X_noRSV_40ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln], + (instregex "(S|U)CVTFS(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_97ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln], + (instregex "(S|U)CVTFU(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_110ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln], + (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; +def KryoWrite_4cyc_X_X_114ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_114ln], + (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; +def KryoWrite_1cyc_XA_Y_98ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_98ln], + (instregex "(S|U)DIV(_Int)?(W|X)r")>; +def KryoWrite_2cyc_XY_XY_152ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_152ln], + (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_149ln : + 
SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln], + (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>; +def KryoWrite_4cyc_X_70ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_70ln], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def KryoWrite_4cyc_X_X_191ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_191ln], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def KryoWrite_1cyc_XY_195ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_195ln], + (instregex "(S|U)MOVv.*")>; +def KryoWrite_5cyc_X_71ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_71ln], + (instrs SMULHrr, UMULHrr)>; +def KryoWrite_3cyc_XY_noRSV_186ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln], + (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_187ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_187ln], + (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_noRSV_69ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln], + (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_248ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln], + (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_3cyc_XY_XY_250ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_250ln], + (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_246ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>; +def KryoWrite_3cyc_XY_XY_251ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_251ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_6cyc_XY_X_238ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_XY_X_238ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_249ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>; +def KryoWrite_6cyc_XY_X_noRSV_252ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>; +def KryoWrite_3cyc_XY_noRSV_161ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln], + (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>; +def KryoWrite_3cyc_XY_noRSV_163ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_3cyc_XY_noRSV_163ln], + (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_162ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln], + (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>; +def KryoWrite_3cyc_XY_noRSV_247ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln], + (instregex "(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_noRSV_239ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln], + (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_243ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_243ln], + (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_XY_241ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_241ln], + (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; +def KryoWrite_2cyc_XY_noRSV_240ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln], + (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_XY_242ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_242ln], + (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_2cyc_XY_XY_183ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_183ln], + (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_182ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln], + (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_184ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln], + (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>; +def KryoWrite_4cyc_X_noRSV_185ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln], + (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>; +def KryoWrite_2cyc_XY_noRSV_67ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln], + (instrs ABSv1i64)>; +def KryoWrite_1cyc_XY_63ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI], + (instregex "ADC.*")>; +def KryoWrite_1cyc_XY_63_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63_1ln], + (instregex "ADR.*")>; +def KryoWrite_1cyc_XY_62ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI], + (instregex "ADDS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_64ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI], + (instregex "ADDS?(W|X)r(r|s|x)(64)?")>; +def KryoWrite_1cyc_XY_noRSV_65ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_1cyc_XY_noRSV_65ln], + (instrs ADDv1i64)>; +def KryoWrite_1cyc_XY_noRSV_144ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_146ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_146ln], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_XY_X_noRSV_171ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln], + (instregex "(ADD|SUB)HNv.*")>; +def KryoWrite_1cyc_XY_noRSV_66ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln], + (instrs ADDPv2i64p)>; +def KryoWrite_2cyc_XY_XY_153ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_153ln], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_XY_noRSV_170ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln], + (instrs ADDVv4i32v)>; +def KryoWrite_4cyc_XY_XY_noRSV_173ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln], + (instrs ADDVv8i16v)>; +def KryoWrite_5cyc_XY_X_noRSV_174ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln], + (instrs ADDVv16i8v)>; +def KryoWrite_3cyc_XY_XY_X_X_27ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln], + (instrs AESDrr, AESErr)>; +def KryoWrite_2cyc_X_X_22ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_22ln], + (instrs AESIMCrr, AESMCrr)>; +def KryoWrite_1cyc_XY_noRSV_76ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln], + (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>; +def KryoWrite_1cyc_XY_XY_79ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_79ln], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def KryoWrite_1cyc_X_72ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_72ln], + (instregex "(S|U)?BFM.*")>; +def KryoWrite_1cyc_XY_noRSV_77ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln], + (instregex "(BIC|ORR)S?Wri")>; +def KryoWrite_1cyc_XY_XY_78ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_78ln], + (instregex "(BIC|ORR)S?Xri")>; +def KryoWrite_1cyc_X_noRSV_74ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], + (instrs BIFv8i8, BITv8i8, BSLv8i8)>; +def KryoWrite_1cyc_X_X_75ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_75ln], + (instrs 
BIFv16i8, BITv16i8, BSLv16i8)>; +def KryoWrite_0cyc_noRSV_11ln : + SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_noRSV_11ln], + (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>; +def KryoWrite_0cyc_XY_16ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI], + (instregex "(CCMN|CCMP)(W|X)i")>; +def KryoWrite_0cyc_XY_16_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI], + (instregex "(CCMN|CCMP)(W|X)r")>; +def KryoWrite_2cyc_XY_3ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI], + (instregex "(CLS|CLZ)(W|X)r")>; +def KryoWrite_2cyc_XY_noRSV_7ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def KryoWrite_2cyc_XY_XY_8ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_8ln], + (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>; +def KryoWrite_2cyc_XY_noRSV_80ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_83ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_83ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_noRSV_81ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>; +def KryoWrite_2cyc_XY_XY_82ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_82ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>; +def KryoWrite_3cyc_XY_4ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg], + (instregex "CRC32.*")>; +def KryoWrite_1cyc_XY_20ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI], + (instregex "CSEL(W|X)r")>; +def KryoWrite_1cyc_X_17ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI], + (instregex "(CSINC|CSNEG)(W|X)r")>; +def KryoWrite_1cyc_XY_18ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI], + (instregex "(CSINV)(W|X)r")>; +def KryoWrite_3cyc_LS_X_13ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_X_13ln], + (instrs DRPS)>; +def KryoWrite_0cyc_LS_10ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_10ln], + (instrs DSB, DMB, CLREX)>; +def KryoWrite_1cyc_X_noRSV_196ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln], + (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>; +def KryoWrite_1cyc_X_X_197ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let 
NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_197ln], + (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>; +def KryoWrite_3cyc_LS_LS_X_15ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln], + (instrs ERET)>; +def KryoWrite_1cyc_X_noRSV_207ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_207ln], + (instrs EXTv8i8)>; +def KryoWrite_1cyc_X_X_212ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_212ln], + (instrs EXTv16i8)>; +def KryoWrite_2cyc_XY_X_136ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_X_136ln], + (instrs EXTRWrri, EXTRXrri)>; +def KryoWrite_2cyc_XY_noRSV_35ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln], + (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>; +def KryoWrite_2cyc_XY_XY_106ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_106ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_104ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>; +def KryoWrite_3cyc_XY_noRSV_107ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln], + (instregex "F(MAX|MIN)(NM)?Vv4i32v")>; +def KryoWrite_3cyc_XY_noRSV_101ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln], + (instregex "FABD(32|64|v2f32)")>; +def KryoWrite_3cyc_XY_XY_103ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_103ln], + (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>; +def KryoWrite_1cyc_XY_noRSV_48ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln], + (instregex "F(ABS|NEG)(D|S)r")>; +def KryoWrite_1cyc_XY_noRSV_124ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln], + (instregex "F(ABS|NEG)v2f32")>; +def KryoWrite_1cyc_XY_XY_125ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_125ln], + (instregex "F(ABS|NEG)(v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_33ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln], + (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>; +def KryoWrite_3cyc_XY_noRSV_30ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln], + (instregex "(FADD|FSUB)(D|S)rr")>; +def KryoWrite_3cyc_XY_noRSV_100ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln], + (instregex "(FADD|FSUB|FADDP)v2f32")>; +def KryoWrite_3cyc_XY_noRSV_29ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln], + (instregex 
"FADDP(v2i32p|v2i64p)")>; +def KryoWrite_0cyc_XY_31ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_31ln], + (instregex "FCCMPE?(D|S)rr")>; +def KryoWrite_2cyc_XY_noRSV_34ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>; +def KryoWrite_2cyc_XY_XY_36ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_36ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>; +def KryoWrite_2cyc_XY_noRSV_105ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln], + (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>; +def KryoWrite_0cyc_XY_32ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_32ln], + (instregex "FCMPE?(D|S)r(r|i)")>; +def KryoWrite_1cyc_XY_noRSV_49ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln], + (instrs FCSELDrrr, FCSELSrrr)>; +def KryoWrite_4cyc_X_noRSV_41ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln], + (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>; +def KryoWrite_4cyc_X_38ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_38ln], + (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>; +def KryoWrite_4cyc_X_noRSV_113ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>; +def KryoWrite_4cyc_X_X_117ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_117ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>; +def KryoWrite_5cyc_X_X_XY_noRSV_119ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln], + (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_X_116ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_116ln], + (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_noRSV_112ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln], + (instrs FCVTXNv1i64)>; +def KryoWrite_4cyc_X_37ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_37ln], + (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; +def KryoWrite_4cyc_X_noRSV_111ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln], + (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>; +def KryoWrite_4cyc_X_X_115ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_115ln], + (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>; +def KryoWrite_1cyc_XA_Y_noRSV_43ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln], 
+ (instrs FDIVDrr, FDIVSrr)>; +def KryoWrite_1cyc_XA_Y_noRSV_121ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln], + (instrs FDIVv2f32)>; +def KryoWrite_1cyc_XA_Y_XA_Y_123ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln], + (instrs FDIVv2f64, FDIVv4f32)>; +def KryoWrite_5cyc_X_noRSV_55ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln], + (instregex "FN?M(ADD|SUB)Srrr")>; +def KryoWrite_6cyc_X_noRSV_57ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln], + (instregex "FN?M(ADD|SUB)Drrr")>; +def KryoWrite_5cyc_X_noRSV_51ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln], + (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>; +def KryoWrite_5cyc_X_X_56ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_56ln], + (instrs FMLAv4f32, FMLSv4f32)>; +def KryoWrite_6cyc_X_X_61ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_61ln], + (instrs FMLAv2f64, FMLSv2f64)>; +def KryoWrite_5cyc_X_noRSV_128ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln], + (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>; +def KryoWrite_5cyc_X_X_131ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_131ln], + (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>; +def KryoWrite_6cyc_X_X_134ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_134ln], + (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>; +def KryoWrite_6cyc_X_noRSV_60ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln], + (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>; +def KryoWrite_1cyc_XY_45ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_45ln], + (instregex "FMOV(XDHigh|DXHigh|DX)r")>; +def KryoWrite_1cyc_XY_noRSV_47ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln], + (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>; +def KryoWrite_5cyc_X_noRSV_53ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln], + (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>; +def KryoWrite_5cyc_X_noRSV_127ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln], + (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>; +def KryoWrite_5cyc_X_X_130ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_130ln], + (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>; +def KryoWrite_6cyc_X_X_133ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_133ln], + (instrs 
FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>; +def KryoWrite_5cyc_X_noRSV_54ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln], + (instrs FMULSrr, FNMULSrr, FMULX32)>; +def KryoWrite_6cyc_X_noRSV_59ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln], + (instrs FMULDrr, FNMULDrr, FMULX64)>; +def KryoWrite_3cyc_XY_noRSV_28ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln], + (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64 )>; +def KryoWrite_3cyc_XY_noRSV_99ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln], + (instrs FRECPEv2f32, FRSQRTEv2f32)>; +def KryoWrite_3cyc_XY_XY_102ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_102ln], + (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; +def KryoWrite_5cyc_X_noRSV_52ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln], + (instrs FRECPS32, FRSQRTS32)>; +def KryoWrite_6cyc_X_noRSV_58ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln], + (instrs FRECPS64, FRSQRTS64)>; +def KryoWrite_5cyc_X_noRSV_126ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln], + (instrs FRECPSv2f32, FRSQRTSv2f32)>; +def KryoWrite_5cyc_X_X_129ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_129ln], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def KryoWrite_6cyc_X_X_132ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_132ln], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def KryoWrite_3cyc_XY_noRSV_50ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln], + (instrs FRECPXv1i32, FRECPXv1i64)>; +def KryoWrite_2cyc_XY_noRSV_39ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>; +def KryoWrite_2cyc_XY_noRSV_108ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln], + (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>; +def KryoWrite_2cyc_XY_XY_109ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_109ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>; +def KryoWrite_1cyc_XA_Y_noRSV_42ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln], + (instregex "FSQRT(S|D)r")>; +def KryoWrite_1cyc_XA_Y_noRSV_120ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln], + (instregex "FSQRTv2f32")>; +def KryoWrite_1cyc_XA_Y_XA_Y_122ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln], + (instregex "FSQRT(v2f64|v4f32)")>; +def KryoWrite_1cyc_X_201ln : + 
SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_201ln], + (instregex "INSv.*")>; +def KryoWrite_3cyc_LS_255ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_255ln], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>; +def KryoWrite_4cyc_LS_X_270ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_X_270ln], + (instregex "LD1(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_noRSV_285ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln], + (instregex "LD1One(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_289ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_298ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr], + (instregex "LD1(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_308ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln], + (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_noRSV_317ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr], + (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_328ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr], + (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_332ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr], + (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln], + (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln], + (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln], + (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr], + (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr], + (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_281ln : + 
SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_281ln], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_311ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_313ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_256ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_256ln], + (instregex "LD1R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_286ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln], + (instregex "LD1R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_290ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr], + (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_318ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr], + (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_257ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_257ln], + (instregex "LD2i64$")>; +def KryoWrite_3cyc_LS_XY_291ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr], + (instregex "LD2i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_296ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln], + (instregex "LD2(i8|i16|i32)$")>; +def KryoWrite_4cyc_LS_XY_X_X_321ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr], + (instregex "LD2(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_282ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_282ln], + (instregex "LD2R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_312ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln], + (instregex "LD2R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_314ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr], + (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr], + (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_283ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { 
+ let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_283ln], + (instregex "LD3i64$")>; +def KryoWrite_3cyc_LS_LS_LS_309ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln], + (instregex "LD3Threev2d$")>; +def KryoWrite_3cyc_LS_XY_LS_315ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr], + (instregex "LD3i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_X_320ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln], + (instregex "LD3(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_331ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr], + (instregex "LD3Threev2d_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_338ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr], + (instregex "LD3(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln], + (instregex "LD3Three(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr], + (instregex "LD3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln], + (instregex "LD3Three(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr], + (instregex "LD3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_310ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln], + (instregex "LD3R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_333ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr], + (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln], + (instregex "LD3R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr], 
+ (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_284ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_284ln], + (instregex "LD4i64$")>; +def KryoWrite_3cyc_LS_XY_LS_316ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr], + (instregex "LD4i64_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_329ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln], + (instregex "LD4Four(v2d)$")>; +def KryoWrite_4cyc_LS_X_X_X_X_337ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln], + (instregex "LD4(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr], + (instregex "LD4Four(v2d)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr], + (instregex "LD4(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln], + (instregex "LD4Four(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr], + (instregex "LD4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln], + (instregex "LD4Four(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr], + (instregex "LD4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_330ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln], + (instregex "LD4R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr], + (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let 
NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln], + (instregex "LD4R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr], + (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_400ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], + (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; +def KryoWrite_3cyc_LS_LS_401ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi], + (instrs LDNPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_408ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi], + (instrs LDNPDi, LDNPSi)>; +def KryoWrite_3cyc_LS_394ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi], + (instrs LDNPWi, LDNPXi)>; +def KryoWrite_3cyc_LS_LS_402ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi], + (instrs LDPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_409ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi], + (instrs LDPDi, LDPSi)>; +def KryoWrite_3cyc_LS_XY_LS_410ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr], + (instregex "LDPQ(post|pre)")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr], + (instregex "LDP(D|S)(post|pre)")>; +def KryoWrite_3cyc_LS_393ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi], + (instrs LDPWi, LDPXi)>; +def KryoWrite_3cyc_LS_XY_403ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr], + (instregex "LDP(W|X)(post|pre)")>; +def KryoWrite_4cyc_LS_395ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi], + (instrs LDPSWi)>; +def KryoWrite_4cyc_LS_XY_405ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr], + (instrs LDPSWpost, LDPSWpre)>; +def KryoWrite_3cyc_LS_264ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_264ln], + (instrs LDRQui, LDRQl)>; +def KryoWrite_4cyc_X_LS_271ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_271ln], + (instrs LDRQroW, LDRQroX)>; +def KryoWrite_3cyc_LS_noRSV_287ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln], + (instregex "LDR((D|S)l|(D|S|H|B)ui)")>; +def KryoWrite_3cyc_LS_XY_293ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> 
{ + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr], + (instrs LDRQpost, LDRQpre)>; +def KryoWrite_4cyc_X_LS_noRSV_297ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln], + (instregex "LDR(D|S|H|B)ro(W|X)")>; +def KryoWrite_3cyc_LS_XY_noRSV_319ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr], + (instregex "LDR(D|S|H|B)(post|pre)")>; +def KryoWrite_3cyc_LS_261ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_261ln], + (instregex "LDR(BB|HH|W|X)ui")>; +def KryoWrite_3cyc_LS_XY_292ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr], + (instregex "LDR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_4cyc_X_LS_272ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_272ln], + (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>; +def KryoWrite_3cyc_LS_262ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_262ln], + (instrs LDRWl, LDRXl)>; +def KryoWrite_4cyc_LS_268ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_268ln], + (instregex "LDRS(BW|BX|HW|HX|W)ui")>; +def KryoWrite_5cyc_X_LS_273ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_LS_273ln], + (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>; +def KryoWrite_4cyc_LS_XY_294ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr], + (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>; +def KryoWrite_4cyc_LS_269ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_269ln], + (instrs LDRSWl)>; +def KryoWrite_3cyc_LS_260ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_260ln], + (instregex "LDTR(B|H|W|X)i")>; +def KryoWrite_4cyc_LS_267ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_267ln], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def KryoWrite_3cyc_LS_263ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_263ln], + (instrs LDURQi)>; +def KryoWrite_3cyc_LS_noRSV_288ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln], + (instregex "LDUR(D|S|H|B)i")>; +def KryoWrite_3cyc_LS_259ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_259ln], + (instregex "LDUR(BB|HH|W|X)i")>; +def KryoWrite_4cyc_LS_266ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_266ln], + (instregex "LDURS(B|H)?(W|X)i")>; +def KryoWrite_3cyc_LS_258ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_258ln], + (instregex "LDXP(W|X)")>; +def KryoWrite_3cyc_LS_258_1ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : 
InstRW<[KryoWrite_3cyc_LS_258_1ln], + (instregex "LDXR(B|H|W|X)")>; +def KryoWrite_2cyc_XY_XY_137ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_137ln], + (instrs LSLVWr, LSLVXr)>; +def KryoWrite_1cyc_XY_135ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_135ln], + (instregex "(LS|AS|RO)RV(W|X)r")>; +def KryoWrite_4cyc_X_84ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_84ln], + (instrs MADDWrrr, MSUBWrrr)>; +def KryoWrite_5cyc_X_85ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_85ln], + (instrs MADDXrrr, MSUBXrrr)>; +def KryoWrite_4cyc_X_noRSV_188ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln], + (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>; +def KryoWrite_4cyc_X_X_192ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_192ln], + (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>; +def KryoWrite_1cyc_XY_noRSV_198ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln], + (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>; +def KryoWrite_1cyc_XY_XY_199ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_199ln], + (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>; +def KryoWrite_1cyc_X_89ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_89ln], + (instrs MOVKWi, MOVKXi)>; +def KryoWrite_1cyc_XY_91ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_91ln], + (instrs MOVNWi, MOVNXi)>; +def KryoWrite_1cyc_XY_90ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_90ln], + (instrs MOVZWi, MOVZXi)>; +def KryoWrite_2cyc_XY_93ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_93ln], + (instrs MRS)>; +def KryoWrite_0cyc_X_87ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm4)>; +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm1)>; +def KryoWrite_0cyc_XY_88ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_88ln], + (instrs MSR)>; +def KryoWrite_1cyc_XY_noRSV_143ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln], + (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_145ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_145ln], + (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_noRSV_193ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln], + (instrs NOTv8i8)>; +def KryoWrite_1cyc_XY_XY_194ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_194ln], + (instrs NOTv16i8)>; +def 
KryoWrite_2cyc_XY_noRSV_234ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln], + (instrs PMULv8i8)>; +def KryoWrite_2cyc_XY_XY_236ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_236ln], + (instrs PMULv16i8)>; +def KryoWrite_2cyc_XY_XY_235ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_235ln], + (instrs PMULLv8i8, PMULLv16i8)>; +def KryoWrite_3cyc_XY_XY_237ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_237ln], + (instrs PMULLv1i64, PMULLv2i64)>; +def KryoWrite_0cyc_LS_254ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_254ln], + (instrs PRFMl, PRFMui)>; +def KryoWrite_0cyc_LS_253ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_253ln], + (instrs PRFUMi)>; +def KryoWrite_6cyc_XY_X_noRSV_175ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln], + (instregex "R(ADD|SUB)HNv.*")>; +def KryoWrite_2cyc_XY_204ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_204ln], + (instrs RBITWr, RBITXr)>; +def KryoWrite_2cyc_XY_noRSV_218ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln], + (instrs RBITv8i8)>; +def KryoWrite_2cyc_XY_XY_219ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_219ln], + (instrs RBITv16i8)>; +def KryoWrite_1cyc_X_202ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_202ln], + (instregex "REV(16|32)?(W|X)r")>; +def KryoWrite_1cyc_XY_noRSV_214ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln], + (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>; +def KryoWrite_1cyc_XY_XY_216ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_216ln], + (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>; +def KryoWrite_3cyc_X_noRSV_244ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln], + (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>; +def KryoWrite_3cyc_X_X_245ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_245ln], + (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>; +def KryoWrite_1cyc_XY_2ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI], + (instregex "SBCS?(W|X)r")>; +def KryoWrite_2cyc_XA_XA_XA_24ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln], + (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>; +def KryoWrite_1cyc_XY_noRSV_21ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln], + (instrs SHA1Hrr)>; +def KryoWrite_2cyc_X_X_23ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + 
let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_23ln], + (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>; +def KryoWrite_4cyc_XA_XA_XA_25ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln], + (instrs SHA256Hrrr, SHA256H2rrr)>; +def KryoWrite_3cyc_XY_XY_X_X_26ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln], + (instrs SHA256SU1rrr)>; +def KryoWrite_4cyc_X_noRSV_189ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln], + (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>; +def KryoWrite_3cyc_XY_noRSV_68ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln], + (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_157ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln], + (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_164ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_164ln], + (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_X_noRSV_190ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln], + (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>; +def KryoWrite_0cyc_LS_Y_274ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_274ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_1cyc_LS_Y_X_301ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_305ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln], + (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_323ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, 
KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_275ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_275ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_306ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_322ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_324ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln], + (instregex "ST3Three(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln], + (instregex "ST3Threev2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln], + (instregex "ST3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln], + (instregex "ST3Threev2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, 
KryoUnitX, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln], + (instregex "ST3Three(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, + KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 13; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln], + (instregex "ST3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_325ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln], + (instregex "ST4Four(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln], + (instregex "ST4Fourv2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln], + (instregex "ST4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln], + (instregex "ST4Fourv2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 16; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln], + (instregex "ST4Four(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 17; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln], + (instregex "ST4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_LS_Y_299ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln], + (instregex "STLR(B|H|W|X)")>; +def KryoWrite_3cyc_LS_LS_Y_307ln : + SchedWriteRes<[KryoUnitLS, 
KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln], + (instregex "STLX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_0cyc_LS_Y_276ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_276ln], + (instrs STNPDi, STNPSi)>; +def KryoWrite_0cyc_LS_Y_LS_Y_326ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln], + (instrs STNPQi)>; +def KryoWrite_0cyc_LS_Y_280ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_280ln], + (instrs STNPWi, STNPXi)>; +def KryoWrite_0cyc_LS_Y_277ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_277ln], + (instregex "STP(D|S)i")>; +def KryoWrite_1cyc_LS_Y_X_303ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln], + (instregex "STP(D|S)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_LS_Y_327ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln], + (instrs STPQi)>; +def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln], + (instrs STPQpost, STPQpre)>; +def KryoWrite_0cyc_LS_Y_279ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_279ln], + (instregex "STP(W|X)i")>; +def KryoWrite_1cyc_LS_X_Y_300ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln], + (instregex "STP(W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_278ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_278ln], + (instregex "STR(Q|D|S|H|B)ui")>; +def KryoWrite_1cyc_X_LS_Y_295ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln], + (instregex "STR(D|S|H|B)ro(W|X)")>; +def KryoWrite_1cyc_LS_Y_X_304ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln], + (instregex "STR(Q|D|S|H|B)(post|pre)")>; +def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, + KryoUnitY]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln], + (instregex "STRQro(W|X)")>; +def KryoWrite_0cyc_LS_Y_399ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_399ln], + (instregex "STR(BB|HH|W|X)ui")>; +def KryoWrite_1cyc_X_LS_Y_406ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln], + (instregex "STR(BB|HH|W|X)ro(W|X)")>; +def KryoWrite_1cyc_LS_X_Y_407ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln], + 
(instregex "STR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_398ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_398ln], + (instregex "STTR(B|H|W|X)i")>; +def KryoWrite_0cyc_LS_Y_396ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_396ln], + (instregex "STUR(Q|D|S|H|B)i")>; +def KryoWrite_0cyc_LS_Y_397ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_397ln], + (instregex "STUR(BB|HH|W|X)i")>; +def KryoWrite_3cyc_LS_Y_404ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_Y_404ln], + (instregex "STX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_3cyc_XY_noRSV_160ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln], + (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_167ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_167ln], + (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI], + (instregex "SUBS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_5ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg], + (instregex "SUBS?(W|X)rx")>; +def KryoWrite_2cyc_XY_XY_5_1ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg], + (instregex "SUBS?(W|X)rs")>; +def KryoWrite_1cyc_XY_noRSV_6ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI], + (instregex "SUBS?(W|X)rr")>; +def KryoWrite_0cyc_LS_9ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_9ln], + (instregex "SYSL?xt")>; +def KryoWrite_1cyc_X_noRSV_205ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln], + (instrs TBLv8i8One)>; +def KryoWrite_1cyc_X_X_208ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_208ln], + (instrs TBLv16i8One)>; +def KryoWrite_2cyc_X_X_X_noRSV_222ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln], + (instrs TBLv8i8Two)>; +def KryoWrite_2cyc_X_X_X_X_X_X_224ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln], + (instrs TBLv16i8Two)>; +def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln], + (instrs TBLv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln], + 
(instrs TBLv8i8Four)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln], + (instrs TBLv16i8Three)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 15; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln], + (instrs TBLv16i8Four)>; +def KryoWrite_2cyc_X_X_noRSV_220ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln], + (instrs TBXv8i8One)>; +def KryoWrite_2cyc_X_X_X_X_221ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln], + (instrs TBXv16i8One)>; +def KryoWrite_3cyc_X_X_X_X_noRSV_223ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln], + (instrs TBXv8i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln], + (instrs TBXv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln], + (instrs TBXv16i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln], + (instrs TBXv8i8Four)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln], + (instrs TBXv16i8Three)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 17; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln], + (instrs TBXv16i8Four)>; +def KryoWrite_1cyc_XY_XY_217ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_217ln], + (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>; +def KryoWrite_1cyc_X_X_211ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_211ln], + (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_XY_213ln : + SchedWriteRes<[KryoUnitX, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def 
: InstRW<[KryoWrite_1cyc_X_XY_213ln], + (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_3cyc_XY_noRSV_156ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln], + (instrs URECPEv2i32, URSQRTEv2i32)>; +def KryoWrite_3cyc_XY_XY_168ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_168ln], + (instrs URECPEv4i32, URSQRTEv4i32)>; +def KryoWrite_1cyc_X_X_210ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_210ln], + (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_noRSV_206ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln], + (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_1cyc_XY_noRSV_215ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln], + (instregex "XTNv.*")>; +def KryoWrite_1cyc_X_X_209ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_209ln], + (instregex "ZIP1(v4i32|v8i16|v16i8)")>; diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td index 6525628dbfd6..2288b8dfc223 100644 --- a/lib/Target/AArch64/AArch64SchedM1.td +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -19,9 +19,8 @@ def ExynosM1Model : SchedMachineModel { let IssueWidth = 4; // Up to 4 uops per cycle. - let MinLatency = 0; // OoO. let MicroOpBufferSize = 96; // ROB size. - let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. let CompleteModel = 0; // Use the default model otherwise. @@ -142,12 +141,13 @@ def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 3; } // Other miscellaneous instructions. -def : WriteRes { let Latency = 1; } +def : WriteRes { let Unsupported = 1; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } //===----------------------------------------------------------------------===// -// Fast forwarding. +// Generic fast forwarding. // TODO: Add FP register forwarding rules. @@ -187,6 +187,10 @@ def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, M1UnitFST]> { let Latency = 3; } def M1WriteNEONI : SchedWriteRes<[M1UnitFST, M1UnitL]> { let Latency = 9; } +def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 6; } +def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 7; } def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } // FIXME: This is the worst case, conditional branch and link. 
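Everything in the Kryo file above and in these Exynos M1 additions follows the same two-step TableGen idiom: a SchedWriteRes names the pipeline units a write occupies and sets its Latency and NumMicroOps, and an InstRW then binds that write to instructions by opcode list or regex. (Judging by the Kryo definitions, the noRSV suffix appears to mark micro-ops that reserve no unit, which is why NumMicroOps can exceed the number of listed units.) A minimal sketch of the idiom, assuming an invented two-pipe core; the Toy* names are illustrative and appear nowhere in this patch:

// Hypothetical machine model, for illustration only.
def ToyModel : SchedMachineModel {
  let IssueWidth        = 2;  // dual dispatch
  let MicroOpBufferSize = 32; // out-of-order window
  let LoadLatency       = 3;
  let CompleteModel     = 0;  // unmatched opcodes fall back to defaults
}

def ToyUnitALU : ProcResource<1>; // one ALU pipe
def ToyUnitMul : ProcResource<1>; // one multiply pipe

let SchedModel = ToyModel in {
// Step 1: a write that holds the multiply pipe as a single micro-op
// and produces its result after 4 cycles.
def ToyWrite_4cyc_Mul : SchedWriteRes<[ToyUnitMul]> {
  let Latency = 4; let NumMicroOps = 1;
}
// Step 2: bind the write to instructions, by exact opcode or by regex,
// just as the Kryo and M1 InstRW records do.
def : InstRW<[ToyWrite_4cyc_Mul], (instrs MADDWrrr, MSUBWrrr)>;
def : InstRW<[ToyWrite_4cyc_Mul], (instregex "^M(ADD|SUB)Xrrr$")>;
}
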
@@ -305,8 +309,10 @@ def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>; +def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>; def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; // ASIMD miscellaneous instructions. @@ -337,16 +343,19 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], (instregex "^TB[LX]v16i8Four")>; def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; -def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. // ASIMD store instructions. // Cryptography instructions. -def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; +def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>; + def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedVulcan.td new file mode 100644 index 000000000000..0aa2462eba83 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedVulcan.td @@ -0,0 +1,855 @@ +//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// 1. Introduction +// +// This file defines the machine model for Broadcom Vulcan to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 2. Pipeline Description. + +def VulcanModel : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 32; + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; +} + +// Define the issue ports. + +// Port 0: ALU, FP/SIMD. +def VulcanP0 : ProcResource<1>; + +// Port 1: ALU, FP/SIMD, integer mul/div. +def VulcanP1 : ProcResource<1>; + +// Port 2: ALU, Branch. +def VulcanP2 : ProcResource<1>; + +// Port 3: Store data. 
+def VulcanP3 : ProcResource<1>; + +// Port 4: Load/store. +def VulcanP4 : ProcResource<1>; + +// Port 5: Load/store. +def VulcanP5 : ProcResource<1>; + +let SchedModel = VulcanModel in { + +// Define groups for the functional units on each +// issue port. Each group created will be used +// by a WriteRes later on. +// +// NOTE: Some groups only contain one member. This +// is a way to create names for the various functional +// units that share a single issue port. For example, +// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for +// FP ops on port 1. + +// Integer divide and multiply micro-ops only on port 1. +def VulcanI1 : ProcResGroup<[VulcanP1]>; + +// Branch micro-ops only on port 2. +def VulcanI2 : ProcResGroup<[VulcanP2]>; + +// ALU micro-ops on ports 0, 1, and 2. +def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>; + +// Crypto FP/SIMD micro-ops only on port 1. +def VulcanF1 : ProcResGroup<[VulcanP1]>; + +// FP/SIMD micro-ops on ports 0 and 1. +def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>; + +// Store data micro-ops only on port 3. +def VulcanSD : ProcResGroup<[VulcanP3]>; + +// Load/store micro-ops on ports 4 and 5. +def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>; + +// 60 entry unified scheduler. +def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2, + VulcanP3, VulcanP4, VulcanP5]> { + let BufferSize=60; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>. + +// 3 cycles on I1. +def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; } + +// 4 cycles on I1. +def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; } + +// 1 cycle on I0, I1, or I2. +def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; } + +// 5 cycles on F1. +def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; } + +// 7 cycles on F1. +def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; } + +// 4 cycles on F0 or F1. +def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; } + +// 5 cycles on F0 or F1. +def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; } + +// 6 cycles on F0 or F1. +def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; } + +// 7 cycles on F0 or F1. +def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; } + +// 8 cycles on F0 or F1. +def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; } + +// 16 cycles on F0 or F1. +def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> { + let Latency = 16; + let ResourceCycles = [8]; +} + +// 23 cycles on F0 or F1. +def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> { + let Latency = 23; + let ResourceCycles = [11]; +} + +// 1 cycle on LS0 or LS1. +def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; } + +// 4 cycles on LS0 or LS1. +def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; } + +// 5 cycles on LS0 or LS1. +def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; } + +// 6 cycles on LS0 or LS1. +def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; } + +// 5 cycles on LS0 or LS1 and I0, I1, or I2. +def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 : + SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 1 cycle on LS0 or LS1 and F0 or F1. +def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 5 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 6; + let NumMicroOps = 2; +} + +// 7 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// 8 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 8; + let NumMicroOps = 2; +} + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +} + + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +let SchedModel = VulcanModel in { + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; } +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +def : WriteRes<WriteIEReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Move immed +def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; } + +// Variable shift +def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23. Take the average. +def : WriteRes<WriteID32, [VulcanI1]> { + let Latency = 18; + let ResourceCycles = [18]; +} + +// Divide, X-form +// Latency range of 13-39. Take the average. +def : WriteRes<WriteID64, [VulcanI1]> { + let Latency = 26; + let ResourceCycles = [26]; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; } + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; } + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; } + +// Bitfield move, basic +// Bitfield move, insert +// NOTE: Handled by WriteIS. + +// Count leading +def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; } + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr.
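All of the ReadAdvance entries above use an advance of 0, i.e. Vulcan declares no operand forwarding and consumers see the producer's full latency. For contrast, a sketch of how a nonzero advance would read (hypothetical cycle count, not part of this model):

    // A consumer reading through ReadIMA would see WriteIM32/WriteIM64
    // producers as if they had completed 2 cycles earlier.
    def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;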
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; } + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def VulcanWriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>, + SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>; +def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>; + +def VulcanReadAdrBase : SchedReadVariant<[ + SchedVar<ScaledIdxPred, [ReadDefault]>, + SchedVar<NoSchedPred, [ReadDefault]>]>; +def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>; + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset, signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 5; +} + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> { + let Latency = 1; + let NumMicroOps = 3; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP.
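In the FP tables that follow, Latency and ResourceCycles deliberately diverge: VulcanWrite_16Cyc_F01 above reports a 16-cycle result latency but occupies the F0/F1 resource for only 8 of those cycles, modeling a divider that frees its port before the result retires. A sketch of the three knobs side by side (hypothetical write name, values borrowed from the FP divide entry):

    def MyWrite_FDiv : SchedWriteRes<[VulcanF01]> {
      let Latency = 16;         // delay seen by consumers of the result
      let ResourceCycles = [8]; // cycles the listed unit stays busy
      let NumMicroOps = 1;      // dispatch slots consumed
    }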
+ +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; } + +// FP arithmetic +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; } + +// FP divide, S-form +// FP square root, S-form +def : WriteRes<WriteFDiv, [VulcanF01]> { + let Latency = 16; + let ResourceCycles = [8]; +} + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; } + +// FP round to integral +def : InstRW<[VulcanWrite_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; } + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; } + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; } +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; } + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD logical (MOV, MVN, ORN, ORR) +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith, pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP
convert, other, Q-form +// NOTE: Handled by WriteV. + +// ASIMD FP divide, D-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; + +// ASIMD FP multiply accumulate, D-form, FZ +// ASIMD FP multiply accumulate, D-form, no FZ +// ASIMD FP multiply accumulate, Q-form, FZ +// ASIMD FP multiply accumulate, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; + +// ASIMD FP negate +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; + +// ASIMD FP round, D-form +// ASIMD FP round, Q-form +// NOTE: Handled by WriteV. + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>; + +// ASIMD extract +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>; + +// ASIMD extract narrow +// ASIMD extract narrow, saturating +// NOTE: Handled by WriteV.
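These tables work by exception: ASIMD instructions already default to the generic WriteV defined earlier (7 cycles on F0/F1), and each InstRW overrides that default only for the opcodes its regex matches, leaving the instruction definitions themselves untouched. A sketch with a deliberately fake opcode pattern:

    // Only opcodes matching the regex get the 5-cycle write;
    // everything else keeps the WriteV default.
    def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FAKEOPv(8b|16b)$")>;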
+ +// ASIMD insert, element to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD move, integer immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; + +// ASIMD move, FP immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; + +// ASIMD transfer, element to word or word +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; + +// ASIMD transfer gen reg to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", + "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex 
"^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex 
"^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +//-- +// 3.17 Cryptography Extensions +//-- + +// Crypto AES ops +def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 xor ops +// Crypto SHA1 schedule acceleration ops +// Crypto SHA256 schedule acceleration op (1 u-op) +// Crypto SHA256 schedule acceleration op (2 u-ops) +// Crypto SHA256 hash acceleration ops +def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>; + +//-- +// 3.18 CRC +//-- + +// CRC checksum ops +def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>; + +} // SchedModel = VulcanModel diff --git 
a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index eaa9110ab1bc..ce81f48acf71 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -51,15 +51,15 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST. // Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>; // Predicate for determining when an extendable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>; // ScaledIdxPred is true if a WriteLDIdx operand will be // scaled. Subtargets can use this to dynamically select resources and // latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>; // Serialized two-level address load. // EXAMPLE: LOADGot @@ -92,6 +92,8 @@ def WriteV : SchedWrite; // Vector ops. def WriteVLD : SchedWrite; // Vector loads. def WriteVST : SchedWrite; // Vector stores. +def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP) + // Read the unwritten lanes of the VLD's destination registers. def ReadVLD : SchedRead; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index f40293021d74..66a8f332513a 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Check to see if there is a specialized entry-point for memory zeroing. @@ -44,10 +44,16 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } return SDValue(); } +bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( + CodeGenOpt::Level OptLevel) const { + if (OptLevel >= CodeGenOpt::Aggressive) + return true; + return false; +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 97421b45b122..7e4f11091226 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -7,24 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// This file defines the AArch64 subclass for SelectionDAGTargetInfo.
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H -#include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" namespace llvm { -class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { +class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; + bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 1c6b15790ea9..f904b2379416 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -115,6 +115,9 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { } bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetSubtargetInfo &ST = MF.getSubtarget(); TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); TRI = ST.getRegisterInfo(); @@ -141,8 +144,8 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { if (!isNarrowFPStore(MI)) continue; unsigned BaseReg; - unsigned Offset; - if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + int64_t Offset; + if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) { if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) @@ -150,7 +153,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { // Otherwise, continue unpairing the stores in this block. DEBUG(dbgs() << "Unpairing store " << MI << "\n"); SuppressSTP = true; - TII->suppressLdStPair(&MI); + TII->suppressLdStPair(MI); } PrevBaseReg = BaseReg; } else diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f6ee8cf47a6a..7dd8ccbe6c25 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -11,10 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64Subtarget.h" #include "AArch64InstrInfo.h" #include "AArch64PBQPRegAlloc.h" -#include "AArch64Subtarget.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/TargetRegistry.h" @@ -44,58 +43,83 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { CPUString = "generic"; ParseSubtargetFeatures(CPUString, FS); + initializeProperties(); + return *this; } +void AArch64Subtarget::initializeProperties() { + // Initialize CPU specific properties. We should add a tablegen feature for + // this in the future so we can specify it together with the subtarget + // features.
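A sketch of the tablegen feature the comment above asks for (hypothetical feature name and description, not part of this patch), as it might sit next to the existing SubtargetFeature definitions in AArch64.td; MaxInterleaveFactor is the subtarget member introduced below:

    // Would let a CPU definition (or -mattr) set MaxInterleaveFactor to 4
    // instead of hard-coding it per ARMProcFamily in initializeProperties().
    def FeatureMaxInterleave4
        : SubtargetFeature<"max-interleave-4", "MaxInterleaveFactor", "4",
                           "Use an interleave factor of 4 for loops">;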
+ switch (ARMProcFamily) { + case Cyclone: + CacheLineSize = 64; + PrefetchDistance = 280; + MinPrefetchStride = 2048; + MaxPrefetchIterationsAhead = 3; + break; + case CortexA57: + MaxInterleaveFactor = 4; + break; + case ExynosM1: + PrefFunctionAlignment = 4; + PrefLoopAlignment = 3; + break; + case Kryo: + MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; + CacheLineSize = 128; + PrefetchDistance = 740; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 11; + break; + case Vulcan: + MaxInterleaveFactor = 4; + break; + case CortexA35: break; + case CortexA53: break; + case CortexA72: break; + case CortexA73: break; + case Others: break; + } +} + AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), - HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), - CPUString(CPU), TargetTriple(TT), FrameLowering(), + : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), - TLInfo(TM, *this) {} + TLInfo(TM, *this), GISel() {} + +const CallLowering *AArch64Subtarget::getCallLowering() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getCallLowering(); +} + +const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); +} -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. +/// Find the target operand flags that describe how a global value should be +/// referenced for the current subtarget. unsigned char AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - bool isDef = GV->isStrongDefinitionForLinker(); - + const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte // absolute relocation on all global addresses. if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) return AArch64II::MO_GOT; + if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + return AArch64II::MO_GOT; + // The small code mode's direct accesses use ADRP, which cannot necessarily // produce the value 0 (if the code is above 4GB). - if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) { - // In PIC mode use the GOT, but in absolute mode use a constant pool load. - if (TM.getRelocationModel() == Reloc::Static) - return AArch64II::MO_CONSTPOOL; - else - return AArch64II::MO_GOT; - } - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return isDef ? 
AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - } + if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) + return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; } @@ -114,8 +138,7 @@ const char *AArch64Subtarget::getBZeroEntry() const { } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { + unsigned NumRegionInstrs) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; @@ -123,8 +146,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // Enabling or Disabling the latency heuristic is a close call: It seems to // help nearly no benchmark on out-of-order architectures, on the other hand // it regresses register pressure on a few benchmarks. - if (isCyclone()) - Policy.DisableLatencyHeuristic = true; + Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; } bool AArch64Subtarget::enableEarlyIfConversion() const { @@ -146,8 +168,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { - if (!isCortexA57()) - return nullptr; - - return llvm::make_unique<A57ChainingConstraint>(); + return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 151133b2f32c..16a35405c892 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -32,38 +33,64 @@ class StringRef; class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum { +public: + enum ARMProcFamilyEnum : uint8_t { Others, CortexA35, CortexA53, CortexA57, + CortexA72, + CortexA73, Cyclone, - ExynosM1 + ExynosM1, + Kryo, + Vulcan }; +protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; + ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_1aOps; - bool HasV8_2aOps; + bool HasV8_1aOps = false; + bool HasV8_2aOps = false; - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - bool HasPerfMon; - bool HasFullFP16; - bool HasSPE; + bool HasFPARMv8 = false; + bool HasNEON = false; + bool HasCrypto = false; + bool HasCRC = false; + bool HasRAS = false; + bool HasPerfMon = false; + bool HasFullFP16 = false; + bool HasSPE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; + bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; + bool HasZeroCycleZeroing = false; // StrictAlign - Disallow unaligned memory accesses.
- bool StrictAlign; + bool StrictAlign = false; + bool MergeNarrowLoads = false; + bool UseAA = false; + bool PredictableSelectIsExpensive = false; + bool BalanceFPOps = false; + bool CustomAsCheapAsMove = false; + bool UsePostRAScheduler = false; + bool Misaligned128StoreIsSlow = false; + bool AvoidQuadLdStPairs = false; + bool UseAlternateSExtLoadCVTF32Pattern = false; + bool HasMacroOpFusion = false; + bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; + uint8_t MaxInterleaveFactor = 2; + uint8_t VectorInsertExtractBaseCost = 3; + uint16_t CacheLineSize = 0; + uint16_t PrefetchDistance = 0; + uint16_t MinPrefetchStride = 1; + unsigned MaxPrefetchIterationsAhead = UINT_MAX; + unsigned PrefFunctionAlignment = 0; + unsigned PrefLoopAlignment = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -80,12 +107,20 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. + std::unique_ptr<GISelAccessor> GISel; + private: /// initializeSubtargetDependencies - Initializes using CPUString and the /// passed in feature string so that we can use initializer lists for /// subtarget initialization. AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + /// Initialize properties based on the selected processor family. + void initializeProperties(); + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -93,6 +128,11 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); + /// This object will take ownership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -106,10 +146,20 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isGeneric() || isCortexA53() || isCortexA57(); + return UsePostRAScheduler; + } + + /// Returns ARM processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const { + return ARMProcFamily; } bool hasV8_1aOps() const { return HasV8_1aOps; } @@ -126,6 +176,33 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasRAS() const { return HasRAS; } + bool mergeNarrowLoads() const { return MergeNarrowLoads; } + bool balanceFPOps() const { return BalanceFPOps; } + bool predictableSelectIsExpensive() const { + return PredictableSelectIsExpensive; + } + bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } + bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } + bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; } + bool useAlternateSExtLoadCVTF32Pattern() const { + return UseAlternateSExtLoadCVTF32Pattern; + } + bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool useRSqrt() const { return UseRSqrt; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } + unsigned getVectorInsertExtractBaseCost() const { + return VectorInsertExtractBaseCost; + } + unsigned getCacheLineSize() const { return CacheLineSize; } + unsigned getPrefetchDistance() const { return PrefetchDistance; } + unsigned getMinPrefetchStride() const { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const { + return MaxPrefetchIterationsAhead; + } + unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -146,13 +223,7 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isGeneric() const { return CPUString == "generic"; } - bool isCyclone() const { return CPUString == "cyclone"; } - bool isCortexA57() const { return CPUString == "cortex-a57"; } - bool isCortexA53() const { return CPUString == "cortex-a53"; } - bool isExynosM1() const { return CPUString == "exynos-m1"; } - - bool useAA() const override { return isCortexA53(); } + bool useAA() const override { return UseAA; } /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -174,8 +245,7 @@ public: /// returns null. const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td new file mode 100644 index 000000000000..a3736c0868fb --- /dev/null +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -0,0 +1,1018 @@ +//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the symbolic operands permitted for various kinds of +// AArch64 system instruction. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/TableGen/SearchableTable.td" + +//===----------------------------------------------------------------------===// +// AT (address translate) instruction options. +//===----------------------------------------------------------------------===// + +class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>; +def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>; +def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>; +def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>; +def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>; +def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>; +def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>; +def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>; +def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>; +def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>; +def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>; +def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>; +def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>; +def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>; + + +//===----------------------------------------------------------------------===// +// DMB/DSB (data barrier) instruction options. +//===----------------------------------------------------------------------===// + +class DB<string name, bits<4> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding = encoding; +} + +def : DB<"oshld", 0x1>; +def : DB<"oshst", 0x2>; +def : DB<"osh", 0x3>; +def : DB<"nshld", 0x5>; +def : DB<"nshst", 0x6>; +def : DB<"nsh", 0x7>; +def : DB<"ishld", 0x9>; +def : DB<"ishst", 0xa>; +def : DB<"ish", 0xb>; +def : DB<"ld", 0xd>; +def : DB<"st", 0xe>; +def : DB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// DC (data cache maintenance) instruction options. +//===----------------------------------------------------------------------===// + +class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>; +def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>; +def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>; +def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>; +def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>; +def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>; +def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>; +def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>; + +//===----------------------------------------------------------------------===// +// IC (instruction cache maintenance) instruction options.
+//===----------------------------------------------------------------------===// + +class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, + bit needsreg> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; +def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; +def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>; + +//===----------------------------------------------------------------------===// +// ISB (instruction-fetch barrier) instruction options. +//===----------------------------------------------------------------------===// + +class ISB<string name, bits<4> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding; + let Encoding = encoding; +} + +def : ISB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// PRFM (prefetch) instruction options. +//===----------------------------------------------------------------------===// + +class PRFM<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PRFM<"pldl1keep", 0x00>; +def : PRFM<"pldl1strm", 0x01>; +def : PRFM<"pldl2keep", 0x02>; +def : PRFM<"pldl2strm", 0x03>; +def : PRFM<"pldl3keep", 0x04>; +def : PRFM<"pldl3strm", 0x05>; +def : PRFM<"plil1keep", 0x08>; +def : PRFM<"plil1strm", 0x09>; +def : PRFM<"plil2keep", 0x0a>; +def : PRFM<"plil2strm", 0x0b>; +def : PRFM<"plil3keep", 0x0c>; +def : PRFM<"plil3strm", 0x0d>; +def : PRFM<"pstl1keep", 0x10>; +def : PRFM<"pstl1strm", 0x11>; +def : PRFM<"pstl2keep", 0x12>; +def : PRFM<"pstl2strm", 0x13>; +def : PRFM<"pstl3keep", 0x14>; +def : PRFM<"pstl3strm", 0x15>; + +//===----------------------------------------------------------------------===// +// PState instruction options. +//===----------------------------------------------------------------------===// + +class PState<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; + code Requires = [{ {} }]; +} + +def : PState<"SPSel", 0b00101>; +def : PState<"DAIFSet", 0b11110>; +def : PState<"DAIFClr", 0b11111>; +// v8.1a "Privileged Access Never" extension-specific PStates +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : PState<"PAN", 0b00100>; +// v8.2a "User Access Override" extension-specific PStates +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : PState<"UAO", 0b00011>; + + +//===----------------------------------------------------------------------===// +// PSB instruction options. +//===----------------------------------------------------------------------===// + +class PSB<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PSB<"csync", 0x11>; + +//===----------------------------------------------------------------------===// +// TLBI (translation lookaside buffer invalidate) instruction options.
+//===----------------------------------------------------------------------===// + +class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg = 1> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>; +def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>; +def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>; +def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>; +def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>; +def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>; +def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>; +def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>; +def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>; +def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>; +def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>; +def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>; +def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>; +def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>; +def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>; +def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>; +def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>; +def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>; + + +//===----------------------------------------------------------------------===// +// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+
+class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+             bits<3> op2> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<16> Encoding;
+  let Encoding{15-14} = op0;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit Readable = ?;
+  bit Writeable = ?;
+  code Requires = [{ {} }];
+}
+
+class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+  : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 1;
+  let Writeable = 1;
+}
+
+class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+  : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 1;
+  let Writeable = 0;
+}
+
+class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+  : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 0;
+  let Writeable = 1;
+}
+
+//===----------------------
+// Read-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>;
+def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
+def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
+def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>;
+def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>;
+def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>;
+def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>;
+def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>;
+def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>;
+def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>;
+def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>;
+def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>;
+def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>;
+def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>;
+def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>;
+def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>;
+def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>;
+def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>;
+def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>;
+def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>;
+def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>;
+def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000,
0b0110, 0b001>; +def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; +def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; +def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> { + let Requires = [{ {AArch64::HasV8_2aOps} }]; +} +def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>; +def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>; +def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; +def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; +def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; +def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>; +def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>; +def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>; +def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>; +def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>; +def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>; +def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>; +def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>; +def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>; +def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>; +def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>; +def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>; +def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>; +def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>; +def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>; +def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>; +def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>; +def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>; +def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>; +def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>; +def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>; +def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>; +def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>; +def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>; +def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>; +def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>; +def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>; +def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>; +def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>; +def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>; +def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>; +def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>; +def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>; +def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>; +def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>; +def : ROSysReg<"ICC_HPPIR1_EL1", 
0b11, 0b000, 0b1100, 0b1100, 0b010>; +def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>; +def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>; +def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>; + +// v8.1a "Limited Ordering Regions" extension-specific system register +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>; + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>; +def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>; +} + +//===---------------------- +// Write-only regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>; +def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>; + +// Trace Registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>; +def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>; +def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>; +def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>; +def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>; +def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>; + +//===---------------------- +// Read-write regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>; +def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>; +def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>; +def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>; +def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>; +def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>; +def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>; +def : RWSysReg<"DBGBVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b100>; +def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>; +def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>; +def : 
RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>; +def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>; +def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>; +def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>; +def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>; +def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>; +def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>; +def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>; +def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>; +def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>; +def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>; +def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>; +def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>; +def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>; +def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>; +def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>; +def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>; +def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>; +def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>; +def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>; +def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>; +def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>; +def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>; +def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>; +def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>; +def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>; +def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>; +def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>; +def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"OSDLR_EL1", 0b10, 0b000, 0b0001, 0b0011, 0b100>; +def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>; +def 
: RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>; +def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>; +def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; +def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>; +def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; +def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; +def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; +def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>; +def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>; +def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>; +def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>; +def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>; +def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>; +def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>; +def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 
0b0001, 0b000>; +def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>; +def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>; +def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>; +def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>; +def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>; +def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>; +def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>; +def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>; +def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>; +def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>; +def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>; +def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; +def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; +def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; +def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; +def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>; +def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>; +def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>; +def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>; +def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 
0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>; +def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>; +def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>; +def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>; +def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>; +def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>; +def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>; +def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>; +def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>; +def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>; +def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>; +def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>; +def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>; +def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>; +def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>; +def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>; +def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>; +def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>; +def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>; +def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>; +def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>; +def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>; +def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>; +def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>; +def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>; +def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>; +def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>; +def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>; +def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>; +def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>; +def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>; +def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>; +def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>; +def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>; +def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>; +def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>; +def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>; +def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>; +def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>; +def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>; +def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>; +def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>; +def : RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>; +def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>; +def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 
0b100>; +def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>; +def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>; +def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>; +def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>; +def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>; +def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>; +def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>; +def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>; +def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>; +def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>; +def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>; +def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>; +def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>; +def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>; +def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>; +def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>; +def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>; +def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>; +def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>; +def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>; +def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>; +def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>; +def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>; +def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>; +def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>; +def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>; +def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>; +def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>; +def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>; +def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>; +def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>; +def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>; +def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"TRCCNTRLDVR3", 0b10, 0b001, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"TRCCNTCTLR2", 0b10, 
0b001, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>; +def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>; +def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>; +def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>; +def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>; +def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>; +def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>; +def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>; +def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>; +def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>; +def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>; +def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>; +def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>; +def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>; +def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>; +def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>; +def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>; +def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>; +def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>; +def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>; +def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>; +def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>; +def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>; +def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>; +def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>; +def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>; +def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>; +def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>; +def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>; +def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>; +def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>; +def : RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>; +def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>; +def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>; +def : RWSysReg<"TRCSSCSR1", 
0b10, 0b001, 0b0001, 0b1001, 0b010>; +def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>; +def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>; +def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>; +def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>; +def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>; +def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>; +def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>; +def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>; +def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>; +def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>; +def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>; +def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>; +def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>; +def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>; +def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>; +def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>; +def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>; +def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>; +def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>; +def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>; +def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>; +def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>; +def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>; +def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>; +def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>; +def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>; +def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>; +def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>; +def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>; +def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>; +def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>; +def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>; +def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>; +def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>; +def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>; +def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>; +def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>; +def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>; +def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>; +def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>; +def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>; +def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>; +def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>; +def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>; +def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 0b1000, 0b100>; +def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>; +def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>; +def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>; +def : 
RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>; +def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>; +def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>; +def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>; +def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>; +def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>; +def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>; +def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>; +def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>; +def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>; +def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>; +def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>; +def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>; +def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>; +def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>; +def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>; +def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>; +def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>; +def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>; +def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>; +def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>; +def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>; +def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>; +def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>; +def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>; +def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>; +def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>; +def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>; +def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>; +def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>; +def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>; +def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>; +def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>; +def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; +def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; +def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; +def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; +def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; +def : RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>; +def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>; +def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>; +def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 
0b000, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>;
+def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>;
+def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
+def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
+def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
+def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
+def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
+def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>;
+def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>;
+def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>;
+def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>;
+def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>;
+def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>;
+def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>;
+
+// v8.1a "Privileged Access Never" extension-specific system registers
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>;
+def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>;
+}
+
+// v8.1a "Virtualization host extensions" system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL12", 0b11, 0b101,
0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; +} +// v8.2a registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>; + +// v8.2a "Statistical Profiling extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSPE} }] in { +def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; +def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; +def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; +def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>; +def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; +def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; +def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; +def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; +def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +} + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>; +def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>; +def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>; +def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>; +def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>; +def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>; +def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; +} + +// Cyclone specific system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::ProcCyclone} }] in +def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index c52c5544fc7e..0b6345ff8011 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,13 +11,19 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64CallLowering.h" +#include 
"AArch64RegisterBankInfo.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" @@ -57,6 +63,11 @@ EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, " register"), cl::init(true)); +static cl::opt +EnableRedundantCopyElimination("aarch64-redundant-copy-elim", + cl::desc("Enable the redundant copy elimination pass"), + cl::init(true), cl::Hidden); + static cl::opt EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden); @@ -92,11 +103,19 @@ static cl::opt EnableGlobalMerge("aarch64-global-merge", cl::Hidden, cl::desc("Enable the global merge pass")); +static cl::opt + EnableLoopDataPrefetch("aarch64-loop-data-prefetch", cl::Hidden, + cl::desc("Enable the loop data prefetch pass"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine X(TheAArch64leTarget); RegisterTargetMachine Y(TheAArch64beTarget); RegisterTargetMachine Z(TheARM64Target); + auto PR = PassRegistry::getPassRegistry(); + initializeGlobalISel(*PR); + initializeAArch64ExpandPseudoPass(*PR); } //===----------------------------------------------------------------------===// @@ -114,29 +133,79 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (LittleEndian) - return "e-m:e-i64:64-i128:128-n32:64-S128"; - return "E-m:e-i64:64-i128:128-n32:64-S128"; + return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; + return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; } -/// TargetMachine ctor - Create an AArch64 architecture model. +// Helper function to set up the defaults for reciprocals. +static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST) +{ + // For the estimates, convergence is quadratic, so essentially the number of + // digits is doubled after each iteration. ARMv8, the minimum architected + // accuracy of the initial estimate is 2^-8. Therefore, the number of extra + // steps to refine the result for float (23 mantissa bits) and for double + // (52 mantissa bits) are 2 and 3, respectively. + unsigned ExtraStepsF = 2, + ExtraStepsD = ExtraStepsF + 1; + bool UseRsqrt = ST.useRSqrt(); + + TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD); + + TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD); +} + +static Reloc::Model getEffectiveRelocModel(const Triple &TT, + Optional RM) { + // AArch64 Darwin is always PIC. 
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  // AArch64 Darwin is always PIC.
+  if (TT.isOSDarwin())
+    return Reloc::PIC_;
+  // On ELF platforms the default static relocation model has a smart enough
+  // linker to cope with referencing external symbols defined in a shared
+  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+  if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+    return Reloc::Static;
+  return *RM;
+}
+
+/// Create an AArch64 architecture model.
 ///
-AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
-                                           StringRef CPU, StringRef FS,
-                                           const TargetOptions &Options,
-                                           Reloc::Model RM, CodeModel::Model CM,
-                                           CodeGenOpt::Level OL,
-                                           bool LittleEndian)
+AArch64TargetMachine::AArch64TargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian)
     // This nested ternary is horrible, but DL needs to be properly
     // initialized before TLInfo is constructed.
     : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
-                        Options, RM, CM, OL),
+                        Options, getEffectiveRelocModel(TT, RM), CM, OL),
       TLOF(createTLOF(getTargetTriple())),
-      isLittle(LittleEndian) {
+      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+  initReciprocals(*this, Subtarget);
   initAsmInfo();
 }
 
 AArch64TargetMachine::~AArch64TargetMachine() {}
 
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct AArch64GISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<CallLowering> CallLoweringInfo;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
+  const CallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    return RegBankInfo.get();
+  }
+};
+} // End anonymous namespace.
+#endif
+
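The accessor indirection keeps GlobalISel optional at build time: without LLVM_BUILD_GLOBAL_ISEL the base GISelAccessor simply returns null from both getters, so clients probe rather than assume. Roughly (a sketch; it assumes the subtarget getters forward to the installed accessor):

    // Sketch only:
    if (const CallLowering *CL = ST.getCallLowering()) {
      // GlobalISel build: CL is the AArch64CallLowering installed below.
    } else {
      // Non-GlobalISel build: the accessor is the null-returning stub.
    }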
 const AArch64Subtarget *
 AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -156,7 +225,18 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   // function that reside in TargetOptions.
   resetTargetOptions(F);
   I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                          isLittle);
+                                          Subtarget.isLittleEndian());
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    AArch64GISelActualAccessor *GISel =
+        new AArch64GISelActualAccessor();
+    GISel->CallLoweringInfo.reset(
+        new AArch64CallLowering(*I->getTargetLowering()));
+    GISel->RegBankInfo.reset(
+        new AArch64RegisterBankInfo(*I->getRegisterInfo()));
+#endif
+    I->setGISelAccessor(*GISel);
   }
   return I.get();
 }
@@ -165,16 +245,16 @@ void AArch64leTargetMachine::anchor() { }
 
 AArch64leTargetMachine::AArch64leTargetMachine(
     const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
-    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
-    CodeGenOpt::Level OL)
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
    : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
 
 void AArch64beTargetMachine::anchor() { }
 
 AArch64beTargetMachine::AArch64beTargetMachine(
     const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
-    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
-    CodeGenOpt::Level OL)
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
    : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
 
 namespace {
@@ -194,6 +274,10 @@ public:
   void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addRegBankSelect() override;
+#endif
   bool addILPOpts() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
@@ -223,6 +307,13 @@ void AArch64PassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
     addPass(createCFGSimplificationPass());
 
+  // Run LoopDataPrefetch
+  //
+  // Run this before LSR to remove the multiplies involved in computing the
+  // pointer values N iterations ahead.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
+    addPass(createLoopDataPrefetchPass());
+
   TargetPassConfig::addIRPasses();
 
   // Match interleaved memory accesses to ldN/stN intrinsics.
@@ -278,6 +369,17 @@ bool AArch64PassConfig::addInstSelector() {
   return false;
 }
 
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool AArch64PassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
+bool AArch64PassConfig::addRegBankSelect() {
+  addPass(new RegBankSelect());
+  return false;
+}
+#endif
+
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
@@ -303,6 +405,10 @@ void AArch64PassConfig::addPreRegAlloc() {
 }
 
 void AArch64PassConfig::addPostRegAlloc() {
+  // Remove redundant copy instructions.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
+    addPass(createAArch64RedundantCopyEliminationPass());
+
   // Change dead register definitions to refer to the zero register.
  if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
     addPass(createAArch64DeadRegisterDefinitions());
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 8d49a29386ac..b44107b065bd 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -29,7 +29,7 @@ protected:
 public:
   AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
-                       Reloc::Model RM, CodeModel::Model CM,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL, bool IsLittleEndian);
 
   ~AArch64TargetMachine() override;
@@ -46,28 +46,28 @@ public:
   }
 
 private:
-  bool isLittle;
+  AArch64Subtarget Subtarget;
 };
 
-// AArch64leTargetMachine - AArch64 little endian target machine.
+// AArch64 little endian target machine.
 //
 class AArch64leTargetMachine : public AArch64TargetMachine {
   virtual void anchor();
 public:
   AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                          StringRef FS, const TargetOptions &Options,
-                         Reloc::Model RM, CodeModel::Model CM,
+                         Optional<Reloc::Model> RM, CodeModel::Model CM,
                          CodeGenOpt::Level OL);
 };
 
-// AArch64beTargetMachine - AArch64 big endian target machine.
+// AArch64 big endian target machine.
 //
 class AArch64beTargetMachine : public AArch64TargetMachine {
   virtual void anchor();
 public:
   AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                          StringRef FS, const TargetOptions &Options,
-                         Reloc::Model RM, CodeModel::Model CM,
+                         Optional<Reloc::Model> RM, CodeModel::Model CM,
                          CodeGenOpt::Level OL);
 };
 
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9af0e6444789..ecf4d93068a4 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -291,6 +291,61 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
+int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+                                             VectorType *VecTy,
+                                             unsigned Index) {
+
+  // Make sure we were given a valid extend opcode.
+  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+         "Invalid opcode");
+
+  // We are extending an element we extract from a vector, so the source type
+  // of the extend is the element type of the vector.
+  auto *Src = VecTy->getElementType();
+
+  // Sign- and zero-extends are for integer types only.
+  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
+
+  // Get the cost for the extract. We compute the cost (if any) for the extend
+  // below.
+  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+
+  // Legalize the types.
+  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
+  auto DstVT = TLI->getValueType(DL, Dst);
+  auto SrcVT = TLI->getValueType(DL, Src);
+
+  // If the resulting type is still a vector and the destination type is legal,
+  // we may get the extension for free. If not, get the default cost for the
+  // extend.
+  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
+    return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+  // The destination type should be larger than the element type. If not, get
+  // the default cost for the extend.
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) + return Cost + getCastInstrCost(Opcode, Dst, Src); + + switch (Opcode) { + default: + llvm_unreachable("Opcode should be either SExt or ZExt"); + + // For sign-extends, we only need a smov, which performs the extension + // automatically. + case Instruction::SExt: + return Cost; + + // For zero-extends, the extend is performed automatically by a umov unless + // the destination type is i64 and the element type is i8 or i16. + case Instruction::ZExt: + if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) + return Cost; + } + + // If we are unable to perform the extend for free, get the default cost. + return Cost + getCastInstrCost(Opcode, Dst, Src); +} + int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); @@ -313,7 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 3; + return ST->getVectorInsertExtractBaseCost(); } int AArch64TTIImpl::getArithmeticInstrCost( @@ -472,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { } unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { - if (ST->isCortexA57()) - return 4; - return 2; + return ST->getMaxInterleaveFactor(); } void AArch64TTIImpl::getUnrollingPreferences(Loop *L, @@ -571,3 +624,19 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } return true; } + +unsigned AArch64TTIImpl::getCacheLineSize() { + return ST->getCacheLineSize(); +} + +unsigned AArch64TTIImpl::getPrefetchDistance() { + return ST->getPrefetchDistance(); +} + +unsigned AArch64TTIImpl::getMinPrefetchStride() { + return ST->getMinPrefetchStride(); +} + +unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { + return ST->getMaxPrefetchIterationsAhead(); +} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index ec58c4fe309f..4f2e8310d769 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -99,6 +99,9 @@ public: int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, + unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getArithmeticInstrCost( @@ -127,6 +130,14 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace); + + unsigned getCacheLineSize(); + + unsigned getPrefetchDistance(); + + unsigned getMinPrefetchStride(); + + unsigned getMaxPrefetchIterationsAhead(); /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 394c8e78581f..aebc370333e3 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -13,7 +13,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -24,13 +23,14 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include 
"llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include @@ -70,6 +70,8 @@ private: bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); } bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool parseDirectiveArch(SMLoc L); + bool parseDirectiveCPU(SMLoc L); bool parseDirectiveWord(unsigned Size, SMLoc L); bool parseDirectiveInst(SMLoc L); @@ -866,14 +868,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); } template @@ -884,16 +879,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - // MOVZ takes precedence over MOVN. - for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth); } bool isFPImm() const { return Kind == k_FPImm; } @@ -2087,12 +2073,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, - S, getContext())); + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreatePrefetch( + prfop, PRFM ? PRFM->Name : "", S, getContext())); return MatchOperand_Success; } @@ -2101,18 +2084,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - unsigned prfop = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString()); + if (!PRFM) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch( + PRFM->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2127,18 +2107,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PSBHint::PSBHintMapper(); - unsigned psbhint = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString()); + if (!PSB) { TokError("invalid operand for instruction"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. 
- Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePSBHint( + PSB->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2762,12 +2739,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, - ExprLoc, getContext())); + auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreateBarrier( + MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext())); return MatchOperand_Success; } @@ -2776,23 +2750,20 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - unsigned Opt = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (!DB) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; } // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; } - Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), - getLoc(), getContext())); + Operands.push_back(AArch64Operand::CreateBarrier( + DB->Encoding, Tok.getString(), getLoc(), getContext())); Parser.Lex(); // Consume the option return MatchOperand_Success; @@ -2806,28 +2777,22 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - bool IsKnown; - auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MRSReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MSRReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto PStateMapper = AArch64PState::PStateMapper(); - uint32_t PStateField = - PStateMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (PStateField != -1U) && - "register should be -1 if and only if it's unknown"); - - Operands.push_back(AArch64Operand::CreateSysReg( - Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext())); + int MRSReg, MSRReg; + auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString()); + if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) { + MRSReg = SysReg->Readable ? SysReg->Encoding : -1; + MSRReg = SysReg->Writeable ? 
SysReg->Encoding : -1; + } else + MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString()); + + auto PState = AArch64PState::lookupPStateByName(Tok.getString()); + unsigned PStateImm = -1; + if (PState && PState->haveFeatures(getSTI().getFeatureBits())) + PStateImm = PState->Encoding; + + Operands.push_back( + AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg, + PStateImm, getContext())); Parser.Lex(); // Eat identifier return MatchOperand_Success; @@ -4195,6 +4160,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".arch") + return parseDirectiveArch(Loc); + if (IDVal == ".cpu") + return parseDirectiveCPU(Loc); if (IDVal == ".hword") return parseDirectiveWord(2, Loc); if (IDVal == ".word") @@ -4216,6 +4185,99 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveLOH(IDVal, Loc); } +static const struct { + const char *Name; + const FeatureBitset Features; +} ExtensionMap[] = { + { "crc", {AArch64::FeatureCRC} }, + { "crypto", {AArch64::FeatureCrypto} }, + { "fp", {AArch64::FeatureFPARMv8} }, + { "simd", {AArch64::FeatureNEON} }, + + // FIXME: Unsupported extensions + { "lse", {} }, + { "pan", {} }, + { "lor", {} }, + { "rdma", {} }, + { "profile", {} }, +}; + +/// parseDirectiveArch +/// ::= .arch token +bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { + SMLoc ArchLoc = getLoc(); + + StringRef Arch, ExtensionString; + std::tie(Arch, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + unsigned ID = AArch64::parseArch(Arch); + if (ID == ARM::AK_INVALID) { + Error(ArchLoc, "unknown arch name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ""); + if (!ExtensionString.empty()) + STI.setDefaultFeatures("", ("+" + ExtensionString).str()); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + return false; +} + +/// parseDirectiveCPU +/// ::= .cpu id +bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { + SMLoc CPULoc = getLoc(); + + StringRef CPU, ExtensionString; + std::tie(CPU, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + SmallVector RequestedExtensions; + if (!ExtensionString.empty()) + ExtensionString.split(RequestedExtensions, '+'); + + // FIXME This is using tablegen data, but should be moved to ARMTargetParser + // once that is tablegen'ed + if (!getSTI().isCPUStringValid(CPU)) { + Error(CPULoc, "unknown CPU name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); + + FeatureBitset Features = STI.getFeatureBits(); + for (auto Name : RequestedExtensions) { + bool EnableFeature = true; + + if (Name.startswith_lower("no")) { + EnableFeature = false; + Name = Name.substr(2); + } + + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; + + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); + + FeatureBitset ToggleFeatures = EnableFeature + ? 
(~Features & Extension.Features) + : ( Features & Extension.Features); + uint64_t Features = + ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); + setAvailableFeatures(Features); + + break; + } + } + return false; +} + /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile deleted file mode 100644 index 00268c76f8e8..000000000000 --- a/lib/Target/AArch64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f26327ff84ad..a79960ea9605 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -12,8 +12,25 @@ tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables) + add_public_tablegen_target(AArch64CommonTableGen) +# List of all GlobalISel files. +set(GLOBAL_ISEL_FILES + AArch64CallLowering.cpp + AArch64RegisterBankInfo.cpp + ) + +# Add GlobalISel files to the dependencies if the user wants to build it. +if(LLVM_BUILD_GLOBAL_ISEL) + set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) +else() + set(GLOBAL_ISEL_BUILD_FILES"") + set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +endif() + + add_llvm_target(AArch64CodeGen AArch64A57FPLoadBalancing.cpp AArch64AddressTypePromotion.cpp @@ -29,6 +46,7 @@ add_llvm_target(AArch64CodeGen AArch64A53Fix835769.cpp AArch64FrameLowering.cpp AArch64ConditionOptimizer.cpp + AArch64RedundantCopyElimination.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp @@ -43,6 +61,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp + ${GLOBAL_ISEL_BUILD_FILES} ) add_dependencies(LLVMAArch64CodeGen intrinsics_gen) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index f1f968e73123..fe6ea31b9061 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1523,13 +1523,12 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); - bool ValidNamed; - const AArch64Disassembler *Dis = + const AArch64Disassembler *Dis = static_cast(Decoder); - (void)AArch64PState::PStateMapper().toString(pstate_field, - Dis->getSubtargetInfo().getFeatureBits(), ValidNamed); - - return ValidNamed ? 
Success : Fail; + auto PState = AArch64PState::lookupPStateByEncoding(pstate_field); + if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits())) + return Success; + return Fail; } static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, @@ -1574,7 +1573,7 @@ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::WSeqPairsClassRegClassID, RegNo, Addr, Decoder); } @@ -1583,7 +1582,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 7fb57adfeeba..e475e505e7d1 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 82bc949927ce..19d0ba2e1c41 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -134,9 +134,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) CommentStream << "literal pool symbol address: " << ReferenceName; else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - CommentStream << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) { + CommentStream << "literal pool for: \""; + CommentStream.write_escaped(ReferenceName); + CommentStream << "\""; + } else if (ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; else if (ReferenceType == diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index 12b8450b13c6..49e844963797 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H -#include "llvm/MC/MCExternalSymbolizer.h" +#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile deleted file mode 100644 index 741bb817a633..000000000000 --- a/lib/Target/AArch64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. 
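[Note: the AArch64ExternalSymbolizer change above stops splicing ReferenceName into the comment stream verbatim and escapes it first, so a C-string literal containing quotes or control characters cannot corrupt the disassembly comment. Below is a minimal standalone sketch of that escaping, modelled on raw_ostream::write_escaped (octal escapes for non-printable bytes); the helper name is hypothetical and the code is illustration only, not part of the patch:

// Minimal sketch of comment-stream escaping (illustration only).
#include <cctype>
#include <cstdio>
#include <string>

static std::string escapeForComment(const std::string &S) {
  std::string Out;
  for (unsigned char C : S) {
    switch (C) {
    case '\\': Out += "\\\\"; break;
    case '"':  Out += "\\\""; break;
    case '\t': Out += "\\t";  break;
    case '\n': Out += "\\n";  break;
    default:
      if (std::isprint(C)) {
        Out += static_cast<char>(C);
      } else {
        char Buf[8];
        std::snprintf(Buf, sizeof(Buf), "\\%03o", C); // octal escape
        Out += Buf;
      }
    }
  }
  return Out;
}

int main() {
  // A literal-pool string with an embedded quote and newline stays on one line.
  std::printf("literal pool for: \"%s\"\n",
              escapeForComment("hi \"there\"\n").c_str());
}
]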
-# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8a810824370..b4f85204714f 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -219,6 +219,54 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } + // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their + // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 > + // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction + // that can represent the move is the MOV alias, and the rest get printed + // normally. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift; + + if (AArch64_AM::isMOVZMovAlias(Value, Shift, + Opcode == AArch64::MOVZXi ? 64 : 32)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift); + if (RegWidth == 32) + Value = Value & 0xffffffff; + + if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) && + (MI->getOperand(1).getReg() == AArch64::XZR || + MI->getOperand(1).getReg() == AArch64::WZR) && + MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::ORRXri ? 
64 : 32; + uint64_t Value = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), RegWidth); + if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); @@ -928,14 +976,21 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, unsigned Reg = Op.getReg(); O << getRegisterName(Reg); } else if (Op.isImm()) { - O << '#' << Op.getImm(); + printImm(MI, OpNo, STI, O); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } } -void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, +void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << "#" << formatImm(Op.getImm()); +} + +void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -981,12 +1036,12 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, assert(Val == MO.getImm() && "Add/sub immediate out of range!"); unsigned Shift = AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; + O << '#' << formatImm(Val); if (Shift != 0) printShifter(MI, OpNum + 1, STI, O); if (CommentStream) - *CommentStream << '=' << (Val << Shift) << '\n'; + *CommentStream << '=' << formatImm(Val << Shift) << '\n'; } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1104,14 +1159,14 @@ template void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); + O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm()); } void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, raw_ostream &O) { const MCOperand MO = MI->getOperand(OpNum); if (MO.isImm()) { - O << "#" << (MO.getImm() * Scale); + O << "#" << formatImm(MO.getImm() * Scale); } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1123,7 +1178,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, const MCOperand MO1 = MI->getOperand(OpNum + 1); O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); + O << ", #" << formatImm(MO1.getImm() * Scale); } else { assert(MO1.isExpr() && "Unexpected operand type!"); O << ", "; @@ -1136,26 +1191,22 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop); + if (PRFM) + O << PRFM->Name; else - O << '#' << prfop; + O << '#' << formatImm(prfop); } void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned psbhintop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PSB = 
AArch64PSBHint::lookupPSBByEncoding(psbhintop); + if (PSB) + O << PSB->Name; else - O << '#' << psbhintop; + O << '#' << formatImm(psbhintop); } void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, @@ -1310,7 +1361,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * 4); + O << "#" << formatImm(Op.getImm() * 4); return; } @@ -1335,7 +1386,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * (1 << 12)); + O << "#" << formatImm(Op.getImm() * (1 << 12)); return; } @@ -1349,15 +1400,15 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, unsigned Val = MI->getOperand(OpNo).getImm(); unsigned Opcode = MI->getOpcode(); - bool Valid; StringRef Name; - if (Opcode == AArch64::ISB) - Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(), - Valid); - else - Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(), - Valid); - if (Valid) + if (Opcode == AArch64::ISB) { + auto ISB = AArch64ISB::lookupISBByEncoding(Val); + Name = ISB ? ISB->Name : ""; + } else { + auto DB = AArch64DB::lookupDBByEncoding(Val); + Name = DB ? DB->Name : ""; + } + if (!Name.empty()) O << Name; else O << "#" << Val; @@ -1368,10 +1419,19 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MRSMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRRX_EL0) { + O << "DBGDTRRX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, @@ -1379,10 +1439,19 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MSRMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. 
Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRTX_EL0) { + O << "DBGDTRTX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, @@ -1390,13 +1459,11 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - bool Valid; - StringRef Name = - AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid); - if (Valid) - O << Name.upper(); + auto PState = AArch64PState::lookupPStateByEncoding(Val); + if (PState && PState->haveFeatures(STI.getFeatureBits())) + O << PState->Name; else - O << "#" << Val; + O << "#" << formatImm(Val); } void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index ea68d9848b42..65dca99ed04e 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -49,7 +49,9 @@ protected: // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O); diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile deleted file mode 100644 index b17e8d080119..000000000000 --- a/lib/Target/AArch64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
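[Note: when no named entry is readable/writeable under the current feature bits, the MRS/MSR printers above fall back to AArch64SysReg::genericRegisterString, which spells the register in the architectural S<op0>_<op1>_C<n>_C<m>_<op2> form. Below is a standalone sketch of that fallback; it is illustration only, and the op0:op1:CRn:CRm:op2 packing of the 16-bit encoding shown here is an assumption, not taken from this patch:

// Sketch of the generic system-register spelling (illustration only;
// assumes the usual op0[15:14] op1[13:11] CRn[10:7] CRm[6:3] op2[2:0] layout).
#include <cstdint>
#include <cstdio>
#include <string>

static std::string genericSysRegString(uint16_t Bits) {
  unsigned Op0 = (Bits >> 14) & 0x3;
  unsigned Op1 = (Bits >> 11) & 0x7;
  unsigned CRn = (Bits >> 7) & 0xf;
  unsigned CRm = (Bits >> 3) & 0xf;
  unsigned Op2 = Bits & 0x7;
  char Buf[32];
  std::snprintf(Buf, sizeof(Buf), "S%u_%u_C%u_C%u_%u", Op0, Op1, CRn, CRm, Op2);
  return Buf;
}

int main() {
  std::printf("%s\n", genericSysRegString(0xDA21).c_str()); // S3_3_C4_C4_1
}
]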
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 642c18394a67..0196c505ba3c 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 648b1dfc8c5e..3e5ef4df4706 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -753,6 +753,49 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { return (EncVal << 32) | EncVal; } +inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) { + for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16) + if ((Value & ~(0xffffULL << Shift)) == 0) + return true; + + return false; +} + +inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) { + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; +} + +inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) { + // MOVZ takes precedence over MOVN. + if (isAnyMOVZMovAlias(Value, RegWidth)) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isMOVZMovAlias(Value, Shift, RegWidth); +} + +inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) { + if (isAnyMOVZMovAlias(Value, RegWidth)) + return true; + + // It's not a MOVZ, but it might be a MOVN. 
+ Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isAnyMOVZMovAlias(Value, RegWidth); +} + } // end namespace AArch64_AM } // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 7624c7240d68..27993246eb07 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -28,9 +29,12 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; +public: + bool IsLittleEndian; public: - AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + AArch64AsmBackend(const Target &T, bool IsLittleEndian) + : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; @@ -74,12 +78,15 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override; bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; void HandleAssemblerFlag(MCAssemblerFlag Flag) {} unsigned getPointerSize() const { return 8; } + + unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; }; } // end anonymous namespace @@ -129,14 +136,16 @@ static unsigned AdrImmBits(unsigned Value) { return (hi19 << 5) | (lo2 << 29); } -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { +static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx) { + unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast(Value); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return AdrImmBits(Value & 0x1fffffULL); case AArch64::fixup_aarch64_pcrel_adrp_imm21: return AdrImmBits((Value & 0x1fffff000ULL) >> 12); @@ -144,54 +153,66 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case AArch64::fixup_aarch64_pcrel_branch19: // Signed 21-bit immediate if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); // Low two bits are not encoded. 
return (Value >> 2) & 0x7ffff; case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && Value >= 0x1000) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return Value; case AArch64::fixup_aarch64_ldst_imm12_scale2: // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x2000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x1)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); return Value >> 1; case AArch64::fixup_aarch64_ldst_imm12_scale4: // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x4000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); return Value >> 2; case AArch64::fixup_aarch64_ldst_imm12_scale8: // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x8000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x7)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); return Value >> 3; case AArch64::fixup_aarch64_ldst_imm12_scale16: // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x10000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0xf)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned"); return Value >> 4; case AArch64::fixup_aarch64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); + if (Ctx) + Ctx->reportError(Fixup.getLoc(), + "no resolvable MOVZ/MOVK fixups supported yet"); return Value; case AArch64::fixup_aarch64_pcrel_branch14: // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 32767 || SignedValue < -32768)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). 
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3ffffff; case FK_Data_1: case FK_Data_2: @@ -201,6 +222,45 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { } } +/// getFixupKindContainereSizeInBytes - The number of bytes of the +/// container involved in big endian or 0 if the item is little endian +unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const { + if (IsLittleEndian) + return 0; + + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + return 1; + case FK_Data_2: + return 2; + case FK_Data_4: + return 4; + case FK_Data_8: + return 8; + + case AArch64::fixup_aarch64_tlsdesc_call: + case AArch64::fixup_aarch64_movw: + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Instructions are always little endian + return 0; + } +} + void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { @@ -209,7 +269,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value, nullptr); // Shift the value into position. Value <<= Info.TargetOffset; @@ -217,10 +277,25 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned Offset = Fixup.getOffset(); assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + // Used to point to big endian bytes. + unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); + // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
- for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + if (FulleSizeInBytes == 0) { + // Handle as little-endian + for (unsigned i = 0; i != NumBytes; ++i) { + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } else { + // Handle as big-endian + assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); + for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = FulleSizeInBytes - 1 - i; + Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } } bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { @@ -239,6 +314,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, } void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI, MCInst &Res) const { llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } @@ -264,14 +340,14 @@ namespace CU { enum CompactUnwindEncodings { /// \brief A "frameless" leaf function, where no non-volatile registers are /// saved. The return remains in LR throughout the function. - UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, /// \brief No compact unwind encoding available. Instead the low 23-bits of /// the compact unwind encoding is the offset of the DWARF FDE in the /// __eh_frame section. This mode is never used in object files. It is only /// generated by the linker in final linked images, which have only DWARF info /// for a function. - UNWIND_AArch64_MODE_DWARF = 0x03000000, + UNWIND_ARM64_MODE_DWARF = 0x03000000, /// \brief This is a standard arm64 prologue where FP/LR are immediately /// pushed on the stack, then SP is copied to FP. If there are any @@ -279,18 +355,18 @@ enum CompactUnwindEncodings { /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the /// five X pairs and four D pairs can be saved, but the memory layout must be /// in register number order. - UNWIND_AArch64_MODE_FRAME = 0x04000000, + UNWIND_ARM64_MODE_FRAME = 0x04000000, /// \brief Frame register pair encodings. - UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 + UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 }; } // end CU namespace @@ -300,7 +376,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. /// The stack size always needs to be 16 byte aligned. 
uint32_t encodeStackAdjustment(uint32_t StackSize) const { return (StackSize / 16) << 12; @@ -308,7 +384,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : AArch64AsmBackend(T), MRI(MRI) {} + : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, @@ -319,7 +395,7 @@ public: uint32_t generateCompactUnwindEncoding( ArrayRef Instrs) const override { if (Instrs.empty()) - return CU::UNWIND_AArch64_MODE_FRAMELESS; + return CU::UNWIND_ARM64_MODE_FRAMELESS; bool HasFP = false; unsigned StackSize = 0; @@ -331,7 +407,7 @@ public: switch (Inst.getOperation()) { default: // Cannot handle this directive: bail out. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == @@ -356,7 +432,7 @@ public: "Pushing invalid registers for frame!"); // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; HasFP = true; break; } @@ -370,11 +446,11 @@ public: // `.cfi_offset' instructions with the appropriate registers specified. unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. 
The encodings must be in register number order, and the X @@ -390,19 +466,19 @@ public: if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; else { Reg1 = getDRegFromBReg(Reg1); Reg2 = getDRegFromBReg(Reg2); @@ -413,18 +489,18 @@ public: // D14/D15 pair = 0x00000800 if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; else // A pair was pushed which we cannot handle. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; } break; @@ -436,9 +512,9 @@ public: // With compact unwind info we can only represent stack adjustments of up // to 65520 bytes. 
if (StackSize > 65520) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; CompactUnwindEncoding |= encodeStackAdjustment(StackSize); } @@ -453,10 +529,9 @@ namespace { class ELFAArch64AsmBackend : public AArch64AsmBackend { public: uint8_t OSABI; - bool IsLittleEndian; ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); @@ -466,9 +541,6 @@ public: const MCFixup &Fixup, const MCFragment *DF, const MCValue &Target, uint64_t &Value, bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; }; void ELFAArch64AsmBackend::processFixupValue( @@ -489,34 +561,14 @@ void ELFAArch64AsmBackend::processFixupValue( // to the linker -- a relocation! if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) IsResolved = false; -} - -// Returns whether this fixup is based on an address in the .eh_frame section, -// and therefore should be byte swapped. -// FIXME: Should be replaced with something more principled. -static bool isByteSwappedFixup(const MCExpr *E) { - MCValue Val; - if (!E->evaluateAsRelocatable(Val, nullptr, nullptr)) - return false; - if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined()) - return false; - - const MCSectionELF *SecELF = - dyn_cast(&Val.getSymA()->getSymbol().getSection()); - return SecELF->getSectionName() == ".eh_frame"; + // Try to get the encoded value for the fixup as-if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value is invalid. 
+ if (IsResolved) + (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); } -void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - if (isByteSwappedFixup(Fixup.getValue())) - Value = ByteSwap_32(unsigned(Value)); - } - AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} } MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 1f516d1db896..4b4c4097b97b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -29,8 +30,8 @@ public: ~AArch64ELFObjectWriter() override; protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; private: }; @@ -43,9 +44,10 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} -unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { +unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { AArch64MCExpr::VariantKind RefKind = static_cast(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); @@ -61,6 +63,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_PREL16; case FK_Data_4: @@ -79,7 +84,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; - llvm_unreachable("invalid symbol kind for ADRP relocation"); + Ctx.reportError(Fixup.getLoc(), + "invalid symbol kind for ADRP relocation"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch26: return ELF::R_AARCH64_JUMP26; case AArch64::fixup_aarch64_pcrel_call26: @@ -93,10 +100,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, case AArch64::fixup_aarch64_pcrel_branch19: return ELF::R_AARCH64_CONDBR19; default: - llvm_unreachable("Unsupported pc-relative fixup kind"); + Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); + return ELF::R_AARCH64_NONE; } } else { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_ABS16; case FK_Data_4: @@ -121,8 +132,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return 
ELF::R_AARCH64_ADD_ABS_LO12_NC; - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for add (uimm12) instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale1: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST8_ABS_LO12_NC; @@ -135,8 +147,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 8-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale2: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST16_ABS_LO12_NC; @@ -149,8 +162,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 16-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale4: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST32_ABS_LO12_NC; @@ -163,8 +177,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 32-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST64_ABS_LO12_NC; @@ -183,14 +198,16 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 64-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale16: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 128-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_movw: if (RefKind == AArch64MCExpr::VK_ABS_G3) return ELF::R_AARCH64_MOVW_UABS_G3; @@ -236,12 +253,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for movz/movk instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_tlsdesc_call: return ELF::R_AARCH64_TLSDESC_CALL; default: - llvm_unreachable("Unknown ELF relocation type"); + Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type"); + return ELF::R_AARCH64_NONE; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp 
index 7d8e79bc63c8..7b9ff8fa0503 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -154,24 +154,6 @@ public:
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;

-  /// getSIMDShift64OpValue - Return the encoded value for the
-  // shift-by-immediate AdvSIMD instructions.
-  uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups,
-                                    const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
   unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
                    const MCSubtargetInfo &STI) const;

@@ -428,41 +410,6 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
   llvm_unreachable("Invalid value for vector shift amount!");
 }

-uint32_t
-AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
-                                            SmallVectorImpl<MCFixup> &Fixups,
-                                            const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 64 - (MO.getImm());
-}
-
-uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
-    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
-    const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 64 - (MO.getImm() | 32);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
-                                            SmallVectorImpl<MCFixup> &Fixups,
-                                            const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 32 - (MO.getImm() | 16);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
-                                            SmallVectorImpl<MCFixup> &Fixups,
-                                            const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 16 - (MO.getImm() | 8);
-}
-
 /// getFixedPointScaleOpValue - Return the encoded value for the
 // FP-to-fixed-point scale factor.
uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 9f7bed0d3b12..702780621208 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -15,7 +15,6 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -72,10 +71,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { +static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, + CodeModel::Model &CM) { assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && "Only expect Darwin and ELF targets"); @@ -89,19 +86,6 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, else if (CM != CodeModel::Small && CM != CodeModel::Large) report_fatal_error( "Only small and large code models are allowed on AArch64"); - - // AArch64 Darwin is always PIC. - if (TT.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; } static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, @@ -140,7 +124,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo); + TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 342384437c6a..39414cc0c6a5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include namespace llvm { class formatted_raw_ostream; diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile deleted file mode 100644 index 5779ac5ac60a..000000000000 --- a/lib/Target/AArch64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile deleted file mode 100644 index f356c5850413..000000000000 --- a/lib/Target/AArch64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMAArch64CodeGen -TARGET = AArch64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ - AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ - AArch64GenDAGISel.inc \ - AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ - AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ - AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ - AArch64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile deleted file mode 100644 index 9dc9aa4bccf7..000000000000 --- a/lib/Target/AArch64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index cde1c6df2608..e65ba1f2401d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -11,858 +11,84 @@ // //===----------------------------------------------------------------------===// #include "AArch64BaseInfo.h" -#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Regex.h" using namespace llvm; -StringRef AArch64NamedImmMapper::toString(uint32_t Value, - const FeatureBitset& FeatureBits, bool &Valid) const { - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isValueEqual(Value, FeatureBits)) { - Valid = true; - return Mappings[i].Name; - } +namespace llvm { + namespace AArch64AT { +#define GET_AT_IMPL +#include "AArch64GenSystemOperands.inc" } - - Valid = false; - return StringRef(); } -uint32_t AArch64NamedImmMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) { - Valid = true; - return Mappings[i].Value; - } - } - Valid = false; - return -1; +namespace llvm { + namespace AArch64DB { +#define GET_DB_IMPL +#include "AArch64GenSystemOperands.inc" + } } -bool AArch64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; +namespace llvm { + namespace AArch64DC { +#define GET_DC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { - {"s1e1r", S1E1R, {}}, - {"s1e2r", S1E2R, {}}, - {"s1e3r", S1E3R, {}}, - {"s1e1w", S1E1W, {}}, - {"s1e2w", S1E2W, {}}, - {"s1e3w", S1E3W, {}}, - {"s1e0r", S1E0R, {}}, - {"s1e0w", S1E0W, {}}, - {"s12e1r", S12E1R, {}}, - {"s12e1w", S12E1W, {}}, - {"s12e0r", S12E0R, {}}, - {"s12e0w", S12E0W, {}}, -}; - -AArch64AT::ATMapper::ATMapper() - : AArch64NamedImmMapper(ATMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { - {"oshld", OSHLD, {}}, - {"oshst", OSHST, {}}, - {"osh", OSH, {}}, - {"nshld", NSHLD, {}}, - {"nshst", NSHST, {}}, - {"nsh", NSH, {}}, - {"ishld", ISHLD, {}}, - {"ishst", ISHST, {}}, - {"ish", ISH, {}}, - {"ld", LD, {}}, - {"st", ST, {}}, - {"sy", SY, {}} -}; - -AArch64DB::DBarrierMapper::DBarrierMapper() - : AArch64NamedImmMapper(DBarrierMappings, 16u) {} - -const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { - {"zva", ZVA, {}}, - {"ivac", IVAC, {}}, - {"isw", ISW, {}}, - {"cvac", CVAC, {}}, - {"csw", CSW, {}}, - {"cvau", CVAU, {}}, - {"civac", CIVAC, {}}, - {"cisw", CISW, {}} -}; - -AArch64DC::DCMapper::DCMapper() - : AArch64NamedImmMapper(DCMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { - {"ialluis", IALLUIS, {}}, - {"iallu", IALLU, {}}, - {"ivau", IVAU, {}} -}; - -AArch64IC::ICMapper::ICMapper() - : AArch64NamedImmMapper(ICMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { - {"sy", SY, {}}, -}; - -AArch64ISB::ISBMapper::ISBMapper() - : AArch64NamedImmMapper(ISBMappings, 16) {} - -const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { - {"pldl1keep", PLDL1KEEP, {}}, - {"pldl1strm", PLDL1STRM, {}}, - {"pldl2keep", PLDL2KEEP, {}}, - {"pldl2strm", PLDL2STRM, {}}, - 
{"pldl3keep", PLDL3KEEP, {}}, - {"pldl3strm", PLDL3STRM, {}}, - {"plil1keep", PLIL1KEEP, {}}, - {"plil1strm", PLIL1STRM, {}}, - {"plil2keep", PLIL2KEEP, {}}, - {"plil2strm", PLIL2STRM, {}}, - {"plil3keep", PLIL3KEEP, {}}, - {"plil3strm", PLIL3STRM, {}}, - {"pstl1keep", PSTL1KEEP, {}}, - {"pstl1strm", PSTL1STRM, {}}, - {"pstl2keep", PSTL2KEEP, {}}, - {"pstl2strm", PSTL2STRM, {}}, - {"pstl3keep", PSTL3KEEP, {}}, - {"pstl3strm", PSTL3STRM, {}} -}; - -AArch64PRFM::PRFMMapper::PRFMMapper() - : AArch64NamedImmMapper(PRFMMappings, 32) {} - -const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { - {"spsel", SPSel, {}}, - {"daifset", DAIFSet, {}}, - {"daifclr", DAIFClr, {}}, - - // v8.1a "Privileged Access Never" extension-specific PStates - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.2a - {"uao", UAO, {AArch64::HasV8_2aOps}}, -}; - -AArch64PState::PStateMapper::PStateMapper() - : AArch64NamedImmMapper(PStateMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { - // v8.2a "Statistical Profiling" extension-specific PSB operand - {"csync", CSync, {AArch64::FeatureSPE}}, -}; - -AArch64PSBHint::PSBHintMapper::PSBHintMapper() - : AArch64NamedImmMapper(PSBHintMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { - {"mdccsr_el0", MDCCSR_EL0, {}}, - {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, - {"mdrar_el1", MDRAR_EL1, {}}, - {"oslsr_el1", OSLSR_EL1, {}}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}}, - {"pmceid0_el0", PMCEID0_EL0, {}}, - {"pmceid1_el0", PMCEID1_EL0, {}}, - {"midr_el1", MIDR_EL1, {}}, - {"ccsidr_el1", CCSIDR_EL1, {}}, - {"clidr_el1", CLIDR_EL1, {}}, - {"ctr_el0", CTR_EL0, {}}, - {"mpidr_el1", MPIDR_EL1, {}}, - {"revidr_el1", REVIDR_EL1, {}}, - {"aidr_el1", AIDR_EL1, {}}, - {"dczid_el0", DCZID_EL0, {}}, - {"id_pfr0_el1", ID_PFR0_EL1, {}}, - {"id_pfr1_el1", ID_PFR1_EL1, {}}, - {"id_dfr0_el1", ID_DFR0_EL1, {}}, - {"id_afr0_el1", ID_AFR0_EL1, {}}, - {"id_mmfr0_el1", ID_MMFR0_EL1, {}}, - {"id_mmfr1_el1", ID_MMFR1_EL1, {}}, - {"id_mmfr2_el1", ID_MMFR2_EL1, {}}, - {"id_mmfr3_el1", ID_MMFR3_EL1, {}}, - {"id_mmfr4_el1", ID_MMFR4_EL1, {}}, - {"id_isar0_el1", ID_ISAR0_EL1, {}}, - {"id_isar1_el1", ID_ISAR1_EL1, {}}, - {"id_isar2_el1", ID_ISAR2_EL1, {}}, - {"id_isar3_el1", ID_ISAR3_EL1, {}}, - {"id_isar4_el1", ID_ISAR4_EL1, {}}, - {"id_isar5_el1", ID_ISAR5_EL1, {}}, - {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}}, - {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}}, - {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}}, - {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}}, - {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}}, - {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}}, - {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}}, - {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, - {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, - {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, - {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, - {"mvfr0_el1", MVFR0_EL1, {}}, - {"mvfr1_el1", MVFR1_EL1, {}}, - {"mvfr2_el1", MVFR2_EL1, {}}, - {"rvbar_el1", RVBAR_EL1, {}}, - {"rvbar_el2", RVBAR_EL2, {}}, - {"rvbar_el3", RVBAR_EL3, {}}, - {"isr_el1", ISR_EL1, {}}, - {"cntpct_el0", CNTPCT_EL0, {}}, - {"cntvct_el0", CNTVCT_EL0, {}}, - - // Trace registers - {"trcstatr", TRCSTATR, {}}, - {"trcidr8", TRCIDR8, {}}, - {"trcidr9", TRCIDR9, {}}, - {"trcidr10", TRCIDR10, {}}, - {"trcidr11", TRCIDR11, {}}, - {"trcidr12", TRCIDR12, {}}, - {"trcidr13", TRCIDR13, {}}, - {"trcidr0", TRCIDR0, {}}, - {"trcidr1", TRCIDR1, {}}, - {"trcidr2", TRCIDR2, {}}, - {"trcidr3", TRCIDR3, 
{}}, - {"trcidr4", TRCIDR4, {}}, - {"trcidr5", TRCIDR5, {}}, - {"trcidr6", TRCIDR6, {}}, - {"trcidr7", TRCIDR7, {}}, - {"trcoslsr", TRCOSLSR, {}}, - {"trcpdsr", TRCPDSR, {}}, - {"trcdevaff0", TRCDEVAFF0, {}}, - {"trcdevaff1", TRCDEVAFF1, {}}, - {"trclsr", TRCLSR, {}}, - {"trcauthstatus", TRCAUTHSTATUS, {}}, - {"trcdevarch", TRCDEVARCH, {}}, - {"trcdevid", TRCDEVID, {}}, - {"trcdevtype", TRCDEVTYPE, {}}, - {"trcpidr4", TRCPIDR4, {}}, - {"trcpidr5", TRCPIDR5, {}}, - {"trcpidr6", TRCPIDR6, {}}, - {"trcpidr7", TRCPIDR7, {}}, - {"trcpidr0", TRCPIDR0, {}}, - {"trcpidr1", TRCPIDR1, {}}, - {"trcpidr2", TRCPIDR2, {}}, - {"trcpidr3", TRCPIDR3, {}}, - {"trccidr0", TRCCIDR0, {}}, - {"trccidr1", TRCCIDR1, {}}, - {"trccidr2", TRCCIDR2, {}}, - {"trccidr3", TRCCIDR3, {}}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1, {}}, - {"icc_iar0_el1", ICC_IAR0_EL1, {}}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}}, - {"icc_rpr_el1", ICC_RPR_EL1, {}}, - {"ich_vtr_el2", ICH_VTR_EL2, {}}, - {"ich_eisr_el2", ICH_EISR_EL2, {}}, - {"ich_elsr_el2", ICH_ELSR_EL2, {}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}}, -}; - -AArch64SysReg::MRSMapper::MRSMapper() { - InstMappings = &MRSMappings[0]; - NumInstMappings = llvm::array_lengthof(MRSMappings); +namespace llvm { + namespace AArch64IC { +#define GET_IC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0, {}}, - {"oslar_el1", OSLAR_EL1, {}}, - {"pmswinc_el0", PMSWINC_EL0, {}}, - - // Trace registers - {"trcoslar", TRCOSLAR, {}}, - {"trclar", TRCLAR, {}}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1, {}}, - {"icc_eoir0_el1", ICC_EOIR0_EL1, {}}, - {"icc_dir_el1", ICC_DIR_EL1, {}}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, -}; - -AArch64SysReg::MSRMapper::MSRMapper() { - InstMappings = &MSRMappings[0]; - NumInstMappings = llvm::array_lengthof(MSRMappings); +namespace llvm { + namespace AArch64ISB { +#define GET_ISB_IMPL +#include "AArch64GenSystemOperands.inc" + } +} +namespace llvm { + namespace AArch64PRFM { +#define GET_PRFM_IMPL +#include "AArch64GenSystemOperands.inc" + } } +namespace llvm { + namespace AArch64PState { +#define GET_PSTATE_IMPL +#include "AArch64GenSystemOperands.inc" + } +} -const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { - {"osdtrrx_el1", OSDTRRX_EL1, {}}, - {"osdtrtx_el1", OSDTRTX_EL1, {}}, - {"teecr32_el1", TEECR32_EL1, {}}, - {"mdccint_el1", MDCCINT_EL1, {}}, - {"mdscr_el1", MDSCR_EL1, {}}, - {"dbgdtr_el0", DBGDTR_EL0, {}}, - {"oseccr_el1", OSECCR_EL1, {}}, - {"dbgvcr32_el2", DBGVCR32_EL2, {}}, - {"dbgbvr0_el1", DBGBVR0_EL1, {}}, - {"dbgbvr1_el1", DBGBVR1_EL1, {}}, - {"dbgbvr2_el1", DBGBVR2_EL1, {}}, - {"dbgbvr3_el1", DBGBVR3_EL1, {}}, - {"dbgbvr4_el1", DBGBVR4_EL1, {}}, - {"dbgbvr5_el1", DBGBVR5_EL1, {}}, - {"dbgbvr6_el1", DBGBVR6_EL1, {}}, - {"dbgbvr7_el1", DBGBVR7_EL1, {}}, - {"dbgbvr8_el1", DBGBVR8_EL1, {}}, - {"dbgbvr9_el1", DBGBVR9_EL1, {}}, - {"dbgbvr10_el1", DBGBVR10_EL1, {}}, - {"dbgbvr11_el1", DBGBVR11_EL1, {}}, - {"dbgbvr12_el1", DBGBVR12_EL1, {}}, - {"dbgbvr13_el1", DBGBVR13_EL1, {}}, - {"dbgbvr14_el1", DBGBVR14_EL1, {}}, - {"dbgbvr15_el1", DBGBVR15_EL1, {}}, - {"dbgbcr0_el1", DBGBCR0_EL1, {}}, - {"dbgbcr1_el1", DBGBCR1_EL1, {}}, - {"dbgbcr2_el1", DBGBCR2_EL1, {}}, - 
{"dbgbcr3_el1", DBGBCR3_EL1, {}}, - {"dbgbcr4_el1", DBGBCR4_EL1, {}}, - {"dbgbcr5_el1", DBGBCR5_EL1, {}}, - {"dbgbcr6_el1", DBGBCR6_EL1, {}}, - {"dbgbcr7_el1", DBGBCR7_EL1, {}}, - {"dbgbcr8_el1", DBGBCR8_EL1, {}}, - {"dbgbcr9_el1", DBGBCR9_EL1, {}}, - {"dbgbcr10_el1", DBGBCR10_EL1, {}}, - {"dbgbcr11_el1", DBGBCR11_EL1, {}}, - {"dbgbcr12_el1", DBGBCR12_EL1, {}}, - {"dbgbcr13_el1", DBGBCR13_EL1, {}}, - {"dbgbcr14_el1", DBGBCR14_EL1, {}}, - {"dbgbcr15_el1", DBGBCR15_EL1, {}}, - {"dbgwvr0_el1", DBGWVR0_EL1, {}}, - {"dbgwvr1_el1", DBGWVR1_EL1, {}}, - {"dbgwvr2_el1", DBGWVR2_EL1, {}}, - {"dbgwvr3_el1", DBGWVR3_EL1, {}}, - {"dbgwvr4_el1", DBGWVR4_EL1, {}}, - {"dbgwvr5_el1", DBGWVR5_EL1, {}}, - {"dbgwvr6_el1", DBGWVR6_EL1, {}}, - {"dbgwvr7_el1", DBGWVR7_EL1, {}}, - {"dbgwvr8_el1", DBGWVR8_EL1, {}}, - {"dbgwvr9_el1", DBGWVR9_EL1, {}}, - {"dbgwvr10_el1", DBGWVR10_EL1, {}}, - {"dbgwvr11_el1", DBGWVR11_EL1, {}}, - {"dbgwvr12_el1", DBGWVR12_EL1, {}}, - {"dbgwvr13_el1", DBGWVR13_EL1, {}}, - {"dbgwvr14_el1", DBGWVR14_EL1, {}}, - {"dbgwvr15_el1", DBGWVR15_EL1, {}}, - {"dbgwcr0_el1", DBGWCR0_EL1, {}}, - {"dbgwcr1_el1", DBGWCR1_EL1, {}}, - {"dbgwcr2_el1", DBGWCR2_EL1, {}}, - {"dbgwcr3_el1", DBGWCR3_EL1, {}}, - {"dbgwcr4_el1", DBGWCR4_EL1, {}}, - {"dbgwcr5_el1", DBGWCR5_EL1, {}}, - {"dbgwcr6_el1", DBGWCR6_EL1, {}}, - {"dbgwcr7_el1", DBGWCR7_EL1, {}}, - {"dbgwcr8_el1", DBGWCR8_EL1, {}}, - {"dbgwcr9_el1", DBGWCR9_EL1, {}}, - {"dbgwcr10_el1", DBGWCR10_EL1, {}}, - {"dbgwcr11_el1", DBGWCR11_EL1, {}}, - {"dbgwcr12_el1", DBGWCR12_EL1, {}}, - {"dbgwcr13_el1", DBGWCR13_EL1, {}}, - {"dbgwcr14_el1", DBGWCR14_EL1, {}}, - {"dbgwcr15_el1", DBGWCR15_EL1, {}}, - {"teehbr32_el1", TEEHBR32_EL1, {}}, - {"osdlr_el1", OSDLR_EL1, {}}, - {"dbgprcr_el1", DBGPRCR_EL1, {}}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}}, - {"csselr_el1", CSSELR_EL1, {}}, - {"vpidr_el2", VPIDR_EL2, {}}, - {"vmpidr_el2", VMPIDR_EL2, {}}, - {"sctlr_el1", SCTLR_EL1, {}}, - {"sctlr_el2", SCTLR_EL2, {}}, - {"sctlr_el3", SCTLR_EL3, {}}, - {"actlr_el1", ACTLR_EL1, {}}, - {"actlr_el2", ACTLR_EL2, {}}, - {"actlr_el3", ACTLR_EL3, {}}, - {"cpacr_el1", CPACR_EL1, {}}, - {"hcr_el2", HCR_EL2, {}}, - {"scr_el3", SCR_EL3, {}}, - {"mdcr_el2", MDCR_EL2, {}}, - {"sder32_el3", SDER32_EL3, {}}, - {"cptr_el2", CPTR_EL2, {}}, - {"cptr_el3", CPTR_EL3, {}}, - {"hstr_el2", HSTR_EL2, {}}, - {"hacr_el2", HACR_EL2, {}}, - {"mdcr_el3", MDCR_EL3, {}}, - {"ttbr0_el1", TTBR0_EL1, {}}, - {"ttbr0_el2", TTBR0_EL2, {}}, - {"ttbr0_el3", TTBR0_EL3, {}}, - {"ttbr1_el1", TTBR1_EL1, {}}, - {"tcr_el1", TCR_EL1, {}}, - {"tcr_el2", TCR_EL2, {}}, - {"tcr_el3", TCR_EL3, {}}, - {"vttbr_el2", VTTBR_EL2, {}}, - {"vtcr_el2", VTCR_EL2, {}}, - {"dacr32_el2", DACR32_EL2, {}}, - {"spsr_el1", SPSR_EL1, {}}, - {"spsr_el2", SPSR_EL2, {}}, - {"spsr_el3", SPSR_EL3, {}}, - {"elr_el1", ELR_EL1, {}}, - {"elr_el2", ELR_EL2, {}}, - {"elr_el3", ELR_EL3, {}}, - {"sp_el0", SP_EL0, {}}, - {"sp_el1", SP_EL1, {}}, - {"sp_el2", SP_EL2, {}}, - {"spsel", SPSel, {}}, - {"nzcv", NZCV, {}}, - {"daif", DAIF, {}}, - {"currentel", CurrentEL, {}}, - {"spsr_irq", SPSR_irq, {}}, - {"spsr_abt", SPSR_abt, {}}, - {"spsr_und", SPSR_und, {}}, - {"spsr_fiq", SPSR_fiq, {}}, - {"fpcr", FPCR, {}}, - {"fpsr", FPSR, {}}, - {"dspsr_el0", DSPSR_EL0, {}}, - {"dlr_el0", DLR_EL0, {}}, - {"ifsr32_el2", IFSR32_EL2, {}}, - {"afsr0_el1", AFSR0_EL1, {}}, - {"afsr0_el2", AFSR0_EL2, {}}, - {"afsr0_el3", AFSR0_EL3, {}}, - {"afsr1_el1", AFSR1_EL1, {}}, - {"afsr1_el2", AFSR1_EL2, {}}, - {"afsr1_el3", AFSR1_EL3, {}}, - 
{"esr_el1", ESR_EL1, {}}, - {"esr_el2", ESR_EL2, {}}, - {"esr_el3", ESR_EL3, {}}, - {"fpexc32_el2", FPEXC32_EL2, {}}, - {"far_el1", FAR_EL1, {}}, - {"far_el2", FAR_EL2, {}}, - {"far_el3", FAR_EL3, {}}, - {"hpfar_el2", HPFAR_EL2, {}}, - {"par_el1", PAR_EL1, {}}, - {"pmcr_el0", PMCR_EL0, {}}, - {"pmcntenset_el0", PMCNTENSET_EL0, {}}, - {"pmcntenclr_el0", PMCNTENCLR_EL0, {}}, - {"pmovsclr_el0", PMOVSCLR_EL0, {}}, - {"pmselr_el0", PMSELR_EL0, {}}, - {"pmccntr_el0", PMCCNTR_EL0, {}}, - {"pmxevtyper_el0", PMXEVTYPER_EL0, {}}, - {"pmxevcntr_el0", PMXEVCNTR_EL0, {}}, - {"pmuserenr_el0", PMUSERENR_EL0, {}}, - {"pmintenset_el1", PMINTENSET_EL1, {}}, - {"pmintenclr_el1", PMINTENCLR_EL1, {}}, - {"pmovsset_el0", PMOVSSET_EL0, {}}, - {"mair_el1", MAIR_EL1, {}}, - {"mair_el2", MAIR_EL2, {}}, - {"mair_el3", MAIR_EL3, {}}, - {"amair_el1", AMAIR_EL1, {}}, - {"amair_el2", AMAIR_EL2, {}}, - {"amair_el3", AMAIR_EL3, {}}, - {"vbar_el1", VBAR_EL1, {}}, - {"vbar_el2", VBAR_EL2, {}}, - {"vbar_el3", VBAR_EL3, {}}, - {"rmr_el1", RMR_EL1, {}}, - {"rmr_el2", RMR_EL2, {}}, - {"rmr_el3", RMR_EL3, {}}, - {"contextidr_el1", CONTEXTIDR_EL1, {}}, - {"tpidr_el0", TPIDR_EL0, {}}, - {"tpidr_el2", TPIDR_EL2, {}}, - {"tpidr_el3", TPIDR_EL3, {}}, - {"tpidrro_el0", TPIDRRO_EL0, {}}, - {"tpidr_el1", TPIDR_EL1, {}}, - {"cntfrq_el0", CNTFRQ_EL0, {}}, - {"cntvoff_el2", CNTVOFF_EL2, {}}, - {"cntkctl_el1", CNTKCTL_EL1, {}}, - {"cnthctl_el2", CNTHCTL_EL2, {}}, - {"cntp_tval_el0", CNTP_TVAL_EL0, {}}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}}, - {"cntps_tval_el1", CNTPS_TVAL_EL1, {}}, - {"cntp_ctl_el0", CNTP_CTL_EL0, {}}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}}, - {"cntps_ctl_el1", CNTPS_CTL_EL1, {}}, - {"cntp_cval_el0", CNTP_CVAL_EL0, {}}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}}, - {"cntps_cval_el1", CNTPS_CVAL_EL1, {}}, - {"cntv_tval_el0", CNTV_TVAL_EL0, {}}, - {"cntv_ctl_el0", CNTV_CTL_EL0, {}}, - {"cntv_cval_el0", CNTV_CVAL_EL0, {}}, - {"pmevcntr0_el0", PMEVCNTR0_EL0, {}}, - {"pmevcntr1_el0", PMEVCNTR1_EL0, {}}, - {"pmevcntr2_el0", PMEVCNTR2_EL0, {}}, - {"pmevcntr3_el0", PMEVCNTR3_EL0, {}}, - {"pmevcntr4_el0", PMEVCNTR4_EL0, {}}, - {"pmevcntr5_el0", PMEVCNTR5_EL0, {}}, - {"pmevcntr6_el0", PMEVCNTR6_EL0, {}}, - {"pmevcntr7_el0", PMEVCNTR7_EL0, {}}, - {"pmevcntr8_el0", PMEVCNTR8_EL0, {}}, - {"pmevcntr9_el0", PMEVCNTR9_EL0, {}}, - {"pmevcntr10_el0", PMEVCNTR10_EL0, {}}, - {"pmevcntr11_el0", PMEVCNTR11_EL0, {}}, - {"pmevcntr12_el0", PMEVCNTR12_EL0, {}}, - {"pmevcntr13_el0", PMEVCNTR13_EL0, {}}, - {"pmevcntr14_el0", PMEVCNTR14_EL0, {}}, - {"pmevcntr15_el0", PMEVCNTR15_EL0, {}}, - {"pmevcntr16_el0", PMEVCNTR16_EL0, {}}, - {"pmevcntr17_el0", PMEVCNTR17_EL0, {}}, - {"pmevcntr18_el0", PMEVCNTR18_EL0, {}}, - {"pmevcntr19_el0", PMEVCNTR19_EL0, {}}, - {"pmevcntr20_el0", PMEVCNTR20_EL0, {}}, - {"pmevcntr21_el0", PMEVCNTR21_EL0, {}}, - {"pmevcntr22_el0", PMEVCNTR22_EL0, {}}, - {"pmevcntr23_el0", PMEVCNTR23_EL0, {}}, - {"pmevcntr24_el0", PMEVCNTR24_EL0, {}}, - {"pmevcntr25_el0", PMEVCNTR25_EL0, {}}, - {"pmevcntr26_el0", PMEVCNTR26_EL0, {}}, - {"pmevcntr27_el0", PMEVCNTR27_EL0, {}}, - {"pmevcntr28_el0", PMEVCNTR28_EL0, {}}, - {"pmevcntr29_el0", PMEVCNTR29_EL0, {}}, - {"pmevcntr30_el0", PMEVCNTR30_EL0, {}}, - {"pmccfiltr_el0", PMCCFILTR_EL0, {}}, - {"pmevtyper0_el0", PMEVTYPER0_EL0, {}}, - {"pmevtyper1_el0", PMEVTYPER1_EL0, {}}, - {"pmevtyper2_el0", PMEVTYPER2_EL0, {}}, - {"pmevtyper3_el0", PMEVTYPER3_EL0, {}}, - {"pmevtyper4_el0", PMEVTYPER4_EL0, {}}, - {"pmevtyper5_el0", PMEVTYPER5_EL0, {}}, - {"pmevtyper6_el0", PMEVTYPER6_EL0, {}}, - 
{"pmevtyper7_el0", PMEVTYPER7_EL0, {}}, - {"pmevtyper8_el0", PMEVTYPER8_EL0, {}}, - {"pmevtyper9_el0", PMEVTYPER9_EL0, {}}, - {"pmevtyper10_el0", PMEVTYPER10_EL0, {}}, - {"pmevtyper11_el0", PMEVTYPER11_EL0, {}}, - {"pmevtyper12_el0", PMEVTYPER12_EL0, {}}, - {"pmevtyper13_el0", PMEVTYPER13_EL0, {}}, - {"pmevtyper14_el0", PMEVTYPER14_EL0, {}}, - {"pmevtyper15_el0", PMEVTYPER15_EL0, {}}, - {"pmevtyper16_el0", PMEVTYPER16_EL0, {}}, - {"pmevtyper17_el0", PMEVTYPER17_EL0, {}}, - {"pmevtyper18_el0", PMEVTYPER18_EL0, {}}, - {"pmevtyper19_el0", PMEVTYPER19_EL0, {}}, - {"pmevtyper20_el0", PMEVTYPER20_EL0, {}}, - {"pmevtyper21_el0", PMEVTYPER21_EL0, {}}, - {"pmevtyper22_el0", PMEVTYPER22_EL0, {}}, - {"pmevtyper23_el0", PMEVTYPER23_EL0, {}}, - {"pmevtyper24_el0", PMEVTYPER24_EL0, {}}, - {"pmevtyper25_el0", PMEVTYPER25_EL0, {}}, - {"pmevtyper26_el0", PMEVTYPER26_EL0, {}}, - {"pmevtyper27_el0", PMEVTYPER27_EL0, {}}, - {"pmevtyper28_el0", PMEVTYPER28_EL0, {}}, - {"pmevtyper29_el0", PMEVTYPER29_EL0, {}}, - {"pmevtyper30_el0", PMEVTYPER30_EL0, {}}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR, {}}, - {"trcprocselr", TRCPROCSELR, {}}, - {"trcconfigr", TRCCONFIGR, {}}, - {"trcauxctlr", TRCAUXCTLR, {}}, - {"trceventctl0r", TRCEVENTCTL0R, {}}, - {"trceventctl1r", TRCEVENTCTL1R, {}}, - {"trcstallctlr", TRCSTALLCTLR, {}}, - {"trctsctlr", TRCTSCTLR, {}}, - {"trcsyncpr", TRCSYNCPR, {}}, - {"trcccctlr", TRCCCCTLR, {}}, - {"trcbbctlr", TRCBBCTLR, {}}, - {"trctraceidr", TRCTRACEIDR, {}}, - {"trcqctlr", TRCQCTLR, {}}, - {"trcvictlr", TRCVICTLR, {}}, - {"trcviiectlr", TRCVIIECTLR, {}}, - {"trcvissctlr", TRCVISSCTLR, {}}, - {"trcvipcssctlr", TRCVIPCSSCTLR, {}}, - {"trcvdctlr", TRCVDCTLR, {}}, - {"trcvdsacctlr", TRCVDSACCTLR, {}}, - {"trcvdarcctlr", TRCVDARCCTLR, {}}, - {"trcseqevr0", TRCSEQEVR0, {}}, - {"trcseqevr1", TRCSEQEVR1, {}}, - {"trcseqevr2", TRCSEQEVR2, {}}, - {"trcseqrstevr", TRCSEQRSTEVR, {}}, - {"trcseqstr", TRCSEQSTR, {}}, - {"trcextinselr", TRCEXTINSELR, {}}, - {"trccntrldvr0", TRCCNTRLDVR0, {}}, - {"trccntrldvr1", TRCCNTRLDVR1, {}}, - {"trccntrldvr2", TRCCNTRLDVR2, {}}, - {"trccntrldvr3", TRCCNTRLDVR3, {}}, - {"trccntctlr0", TRCCNTCTLR0, {}}, - {"trccntctlr1", TRCCNTCTLR1, {}}, - {"trccntctlr2", TRCCNTCTLR2, {}}, - {"trccntctlr3", TRCCNTCTLR3, {}}, - {"trccntvr0", TRCCNTVR0, {}}, - {"trccntvr1", TRCCNTVR1, {}}, - {"trccntvr2", TRCCNTVR2, {}}, - {"trccntvr3", TRCCNTVR3, {}}, - {"trcimspec0", TRCIMSPEC0, {}}, - {"trcimspec1", TRCIMSPEC1, {}}, - {"trcimspec2", TRCIMSPEC2, {}}, - {"trcimspec3", TRCIMSPEC3, {}}, - {"trcimspec4", TRCIMSPEC4, {}}, - {"trcimspec5", TRCIMSPEC5, {}}, - {"trcimspec6", TRCIMSPEC6, {}}, - {"trcimspec7", TRCIMSPEC7, {}}, - {"trcrsctlr2", TRCRSCTLR2, {}}, - {"trcrsctlr3", TRCRSCTLR3, {}}, - {"trcrsctlr4", TRCRSCTLR4, {}}, - {"trcrsctlr5", TRCRSCTLR5, {}}, - {"trcrsctlr6", TRCRSCTLR6, {}}, - {"trcrsctlr7", TRCRSCTLR7, {}}, - {"trcrsctlr8", TRCRSCTLR8, {}}, - {"trcrsctlr9", TRCRSCTLR9, {}}, - {"trcrsctlr10", TRCRSCTLR10, {}}, - {"trcrsctlr11", TRCRSCTLR11, {}}, - {"trcrsctlr12", TRCRSCTLR12, {}}, - {"trcrsctlr13", TRCRSCTLR13, {}}, - {"trcrsctlr14", TRCRSCTLR14, {}}, - {"trcrsctlr15", TRCRSCTLR15, {}}, - {"trcrsctlr16", TRCRSCTLR16, {}}, - {"trcrsctlr17", TRCRSCTLR17, {}}, - {"trcrsctlr18", TRCRSCTLR18, {}}, - {"trcrsctlr19", TRCRSCTLR19, {}}, - {"trcrsctlr20", TRCRSCTLR20, {}}, - {"trcrsctlr21", TRCRSCTLR21, {}}, - {"trcrsctlr22", TRCRSCTLR22, {}}, - {"trcrsctlr23", TRCRSCTLR23, {}}, - {"trcrsctlr24", TRCRSCTLR24, {}}, - {"trcrsctlr25", TRCRSCTLR25, {}}, - {"trcrsctlr26", 
TRCRSCTLR26, {}}, - {"trcrsctlr27", TRCRSCTLR27, {}}, - {"trcrsctlr28", TRCRSCTLR28, {}}, - {"trcrsctlr29", TRCRSCTLR29, {}}, - {"trcrsctlr30", TRCRSCTLR30, {}}, - {"trcrsctlr31", TRCRSCTLR31, {}}, - {"trcssccr0", TRCSSCCR0, {}}, - {"trcssccr1", TRCSSCCR1, {}}, - {"trcssccr2", TRCSSCCR2, {}}, - {"trcssccr3", TRCSSCCR3, {}}, - {"trcssccr4", TRCSSCCR4, {}}, - {"trcssccr5", TRCSSCCR5, {}}, - {"trcssccr6", TRCSSCCR6, {}}, - {"trcssccr7", TRCSSCCR7, {}}, - {"trcsscsr0", TRCSSCSR0, {}}, - {"trcsscsr1", TRCSSCSR1, {}}, - {"trcsscsr2", TRCSSCSR2, {}}, - {"trcsscsr3", TRCSSCSR3, {}}, - {"trcsscsr4", TRCSSCSR4, {}}, - {"trcsscsr5", TRCSSCSR5, {}}, - {"trcsscsr6", TRCSSCSR6, {}}, - {"trcsscsr7", TRCSSCSR7, {}}, - {"trcsspcicr0", TRCSSPCICR0, {}}, - {"trcsspcicr1", TRCSSPCICR1, {}}, - {"trcsspcicr2", TRCSSPCICR2, {}}, - {"trcsspcicr3", TRCSSPCICR3, {}}, - {"trcsspcicr4", TRCSSPCICR4, {}}, - {"trcsspcicr5", TRCSSPCICR5, {}}, - {"trcsspcicr6", TRCSSPCICR6, {}}, - {"trcsspcicr7", TRCSSPCICR7, {}}, - {"trcpdcr", TRCPDCR, {}}, - {"trcacvr0", TRCACVR0, {}}, - {"trcacvr1", TRCACVR1, {}}, - {"trcacvr2", TRCACVR2, {}}, - {"trcacvr3", TRCACVR3, {}}, - {"trcacvr4", TRCACVR4, {}}, - {"trcacvr5", TRCACVR5, {}}, - {"trcacvr6", TRCACVR6, {}}, - {"trcacvr7", TRCACVR7, {}}, - {"trcacvr8", TRCACVR8, {}}, - {"trcacvr9", TRCACVR9, {}}, - {"trcacvr10", TRCACVR10, {}}, - {"trcacvr11", TRCACVR11, {}}, - {"trcacvr12", TRCACVR12, {}}, - {"trcacvr13", TRCACVR13, {}}, - {"trcacvr14", TRCACVR14, {}}, - {"trcacvr15", TRCACVR15, {}}, - {"trcacatr0", TRCACATR0, {}}, - {"trcacatr1", TRCACATR1, {}}, - {"trcacatr2", TRCACATR2, {}}, - {"trcacatr3", TRCACATR3, {}}, - {"trcacatr4", TRCACATR4, {}}, - {"trcacatr5", TRCACATR5, {}}, - {"trcacatr6", TRCACATR6, {}}, - {"trcacatr7", TRCACATR7, {}}, - {"trcacatr8", TRCACATR8, {}}, - {"trcacatr9", TRCACATR9, {}}, - {"trcacatr10", TRCACATR10, {}}, - {"trcacatr11", TRCACATR11, {}}, - {"trcacatr12", TRCACATR12, {}}, - {"trcacatr13", TRCACATR13, {}}, - {"trcacatr14", TRCACATR14, {}}, - {"trcacatr15", TRCACATR15, {}}, - {"trcdvcvr0", TRCDVCVR0, {}}, - {"trcdvcvr1", TRCDVCVR1, {}}, - {"trcdvcvr2", TRCDVCVR2, {}}, - {"trcdvcvr3", TRCDVCVR3, {}}, - {"trcdvcvr4", TRCDVCVR4, {}}, - {"trcdvcvr5", TRCDVCVR5, {}}, - {"trcdvcvr6", TRCDVCVR6, {}}, - {"trcdvcvr7", TRCDVCVR7, {}}, - {"trcdvcmr0", TRCDVCMR0, {}}, - {"trcdvcmr1", TRCDVCMR1, {}}, - {"trcdvcmr2", TRCDVCMR2, {}}, - {"trcdvcmr3", TRCDVCMR3, {}}, - {"trcdvcmr4", TRCDVCMR4, {}}, - {"trcdvcmr5", TRCDVCMR5, {}}, - {"trcdvcmr6", TRCDVCMR6, {}}, - {"trcdvcmr7", TRCDVCMR7, {}}, - {"trccidcvr0", TRCCIDCVR0, {}}, - {"trccidcvr1", TRCCIDCVR1, {}}, - {"trccidcvr2", TRCCIDCVR2, {}}, - {"trccidcvr3", TRCCIDCVR3, {}}, - {"trccidcvr4", TRCCIDCVR4, {}}, - {"trccidcvr5", TRCCIDCVR5, {}}, - {"trccidcvr6", TRCCIDCVR6, {}}, - {"trccidcvr7", TRCCIDCVR7, {}}, - {"trcvmidcvr0", TRCVMIDCVR0, {}}, - {"trcvmidcvr1", TRCVMIDCVR1, {}}, - {"trcvmidcvr2", TRCVMIDCVR2, {}}, - {"trcvmidcvr3", TRCVMIDCVR3, {}}, - {"trcvmidcvr4", TRCVMIDCVR4, {}}, - {"trcvmidcvr5", TRCVMIDCVR5, {}}, - {"trcvmidcvr6", TRCVMIDCVR6, {}}, - {"trcvmidcvr7", TRCVMIDCVR7, {}}, - {"trccidcctlr0", TRCCIDCCTLR0, {}}, - {"trccidcctlr1", TRCCIDCCTLR1, {}}, - {"trcvmidcctlr0", TRCVMIDCCTLR0, {}}, - {"trcvmidcctlr1", TRCVMIDCCTLR1, {}}, - {"trcitctrl", TRCITCTRL, {}}, - {"trcclaimset", TRCCLAIMSET, {}}, - {"trcclaimclr", TRCCLAIMCLR, {}}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1, {}}, - {"icc_bpr0_el1", ICC_BPR0_EL1, {}}, - {"icc_pmr_el1", ICC_PMR_EL1, {}}, - {"icc_ctlr_el1", ICC_CTLR_EL1, {}}, - 
{"icc_ctlr_el3", ICC_CTLR_EL3, {}}, - {"icc_sre_el1", ICC_SRE_EL1, {}}, - {"icc_sre_el2", ICC_SRE_EL2, {}}, - {"icc_sre_el3", ICC_SRE_EL3, {}}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}}, - {"icc_seien_el1", ICC_SEIEN_EL1, {}}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}}, - {"ich_hcr_el2", ICH_HCR_EL2, {}}, - {"ich_misr_el2", ICH_MISR_EL2, {}}, - {"ich_vmcr_el2", ICH_VMCR_EL2, {}}, - {"ich_vseir_el2", ICH_VSEIR_EL2, {}}, - {"ich_lr0_el2", ICH_LR0_EL2, {}}, - {"ich_lr1_el2", ICH_LR1_EL2, {}}, - {"ich_lr2_el2", ICH_LR2_EL2, {}}, - {"ich_lr3_el2", ICH_LR3_EL2, {}}, - {"ich_lr4_el2", ICH_LR4_EL2, {}}, - {"ich_lr5_el2", ICH_LR5_EL2, {}}, - {"ich_lr6_el2", ICH_LR6_EL2, {}}, - {"ich_lr7_el2", ICH_LR7_EL2, {}}, - {"ich_lr8_el2", ICH_LR8_EL2, {}}, - {"ich_lr9_el2", ICH_LR9_EL2, {}}, - {"ich_lr10_el2", ICH_LR10_EL2, {}}, - {"ich_lr11_el2", ICH_LR11_EL2, {}}, - {"ich_lr12_el2", ICH_LR12_EL2, {}}, - {"ich_lr13_el2", ICH_LR13_EL2, {}}, - {"ich_lr14_el2", ICH_LR14_EL2, {}}, - {"ich_lr15_el2", ICH_LR15_EL2, {}}, - - // Cyclone registers - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}}, - {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}}, - {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}}, - {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}}, - - // v8.1a "Virtualization host extensions" system registers - {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}}, - {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}}, - {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}}, - {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}}, - {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}}, - {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}}, - {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}}, - {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}}, - {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}}, - {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}}, - {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}}, - {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}}, - {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}}, - {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}}, - {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}}, - 
{"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, - {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, - - // v8.2a registers - {"uao", UAO, {AArch64::HasV8_2aOps}}, - - // v8.2a "Statistical Profiling extension" registers - {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, - {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, - {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, - {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, - {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, - {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, - {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, - {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, - {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, - {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, - {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, - {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, - {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, -}; - -uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return SysRegMappings[i].Value; - } +namespace llvm { + namespace AArch64PSBHint { +#define GET_PSB_IMPL +#include "AArch64GenSystemOperands.inc" } +} - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return InstMappings[i].Value; - } +namespace llvm { + namespace AArch64SysReg { +#define GET_SYSREG_IMPL +#include "AArch64GenSystemOperands.inc" } +} +uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) { // Try to parse an S____ register name - Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$"); + Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); + std::string UpperName = Name.upper(); SmallVector Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; + if (!GenericRegPattern.match(UpperName, &Ops)) return -1; - } uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; uint32_t Bits; @@ -873,28 +99,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, Ops[5].getAsInteger(10, Op2); Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - Valid = true; return Bits; } -std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, - const FeatureBitset& FeatureBits) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) { - return SysRegMappings[i].Name; - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isValueEqual(Bits, FeatureBits)) { - return InstMappings[i].Name; - } - } - +std::string AArch64SysReg::genericRegisterString(uint32_t Bits) { assert(Bits < 0x10000); uint32_t Op0 = (Bits >> 14) & 0x3; uint32_t Op1 = (Bits >> 11) & 0x7; @@ -902,44 +110,13 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, uint32_t CRm = (Bits >> 3) & 0xf; uint32_t Op2 = Bits & 0x7; - return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); + return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" + + utostr(CRm) + "_" + utostr(Op2); } -const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { - {"ipas2e1is", IPAS2E1IS, {}}, - {"ipas2le1is", IPAS2LE1IS, {}}, - {"vmalle1is", VMALLE1IS, {}}, - {"alle2is", ALLE2IS, {}}, - {"alle3is", ALLE3IS, {}}, - {"vae1is", VAE1IS, {}}, - {"vae2is", VAE2IS, {}}, - {"vae3is", VAE3IS, {}}, - {"aside1is", ASIDE1IS, {}}, - {"vaae1is", VAAE1IS, {}}, - {"alle1is", ALLE1IS, {}}, - {"vale1is", VALE1IS, {}}, - {"vale2is", VALE2IS, {}}, - {"vale3is", VALE3IS, {}}, - {"vmalls12e1is", VMALLS12E1IS, {}}, - {"vaale1is", VAALE1IS, {}}, - {"ipas2e1", IPAS2E1, {}}, - {"ipas2le1", IPAS2LE1, {}}, - {"vmalle1", VMALLE1, {}}, - {"alle2", ALLE2, {}}, - {"alle3", ALLE3, {}}, - {"vae1", VAE1, {}}, - {"vae2", VAE2, {}}, - {"vae3", VAE3, {}}, - {"aside1", ASIDE1, {}}, - {"vaae1", VAAE1, {}}, - {"alle1", ALLE1, {}}, - {"vale1", VALE1, {}}, - {"vale2", VALE2, {}}, - {"vale3", VALE3, {}}, - {"vmalls12e1", VMALLS12E1, {}}, - {"vaale1", VAALE1, {}} -}; - -AArch64TLBI::TLBIMapper::TLBIMapper() - : AArch64NamedImmMapper(TLBIMappings, 0) {} +namespace llvm { + namespace AArch64TLBI { +#define GET_TLBI_IMPL +#include "AArch64GenSystemOperands.inc" + } +} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e63627eae123..dcc39176031c 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -266,231 +266,85 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { } } // end namespace AArch64CC -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. 
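The hand-rolled mapper scaffolding deleted here, together with the mapping tables earlier in this file, is replaced by TableGen output: each GET_*_IMPL block at the top of the rewritten AArch64BaseInfo.cpp pulls a generated table and its lookup helpers out of AArch64GenSystemOperands.inc (produced from AArch64SystemOperands.td), and the matching GET_*_DECL blocks in the header hunks below declare the structs. As a rough illustration of the shape, taking the AT case and eliding most rows (this is not the exact generated code):

#include <cstdint>
#include <cstring>

// Hypothetical expansion of GET_AT_IMPL; shape only.
struct AT { const char *Name; uint16_t Encoding; };

// Rows come from the .td file; the encodings match the ATValues enum
// removed below (S1E1R == 0x43c0, S1E2R == 0x63c0, and so on).
static const AT ATsList[] = {
    {"s1e1r", 0x43c0}, {"s1e2r", 0x63c0}, {"s1e3r", 0x73c0},
    {"s1e1w", 0x43c1}, {"s1e2w", 0x63c1}, {"s1e3w", 0x73c1},
    // ... remaining operands elided ...
};

const AT *lookupATByName(const char *Name) {
  // The generated lookup is typically a search over a sorted index;
  // a linear scan keeps this sketch short.
  for (const AT &E : ATsList)
    if (std::strcmp(E.Name, Name) == 0)
      return &E;
  return nullptr;
}

This retires the FIXME above by construction: table layout and lookup strategy become TableGen's problem rather than handwritten C++. Per-row feature gating survives the move, too; the new PState struct in the header hunks below carries a FeaturesRequired bitset and a haveFeatures() subset test in place of the old per-Mapping FeatureBitSet checks.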
-struct AArch64NamedImmMapper { - struct Mapping { +namespace AArch64AT{ + struct AT { const char *Name; - uint32_t Value; - // Set of features this mapping is available for - // Zero value of FeatureBitSet means the mapping is always available - FeatureBitset FeatureBitSet; - - bool isNameEqual(std::string Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Name == Other; - } - - bool isValueEqual(uint32_t Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Value == Other; - } - }; - - template - AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm) - : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {} - - // Maps value to string, depending on availability for FeatureBits given - StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, - bool &Valid) const; - // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, - bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Mappings; - size_t NumMappings; - uint32_t TooBigImm; -}; - -namespace AArch64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7, // 01 100 0111 1000 111 - S1E1RP = 0x43c8, // 01 000 0111 1001 000 - S1E1WP = 0x43c9 // 01 000 0111 1001 001 + uint16_t Encoding; }; - struct ATMapper : AArch64NamedImmMapper { - const static Mapping ATMappings[]; - - ATMapper(); - }; + #define GET_AT_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf + struct DB { + const char *Name; + uint16_t Encoding; }; - struct DBarrierMapper : AArch64NamedImmMapper { - const static Mapping DBarrierMappings[]; - - DBarrierMapper(); - }; + #define GET_DB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : AArch64NamedImmMapper { - const static Mapping DCMappings[]; - - DCMapper(); + struct DC { + const char *Name; + uint16_t Encoding; }; + #define GET_DC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64IC { - enum ICValues { - Invalid = -1, // Op1 CRn CRm Op2 
- IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : AArch64NamedImmMapper { - const static Mapping ICMappings[]; - - ICMapper(); + struct IC { + const char *Name; + uint16_t Encoding; + bool NeedsReg; }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } + #define GET_IC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : AArch64NamedImmMapper { - const static Mapping ISBMappings[]; - - ISBMapper(); + struct ISB { + const char *Name; + uint16_t Encoding; }; + #define GET_ISB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : AArch64NamedImmMapper { - const static Mapping PRFMMappings[]; - - PRFMMapper(); + struct PRFM { + const char *Name; + uint16_t Encoding; }; + #define GET_PRFM_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet = 0x1e, - DAIFClr = 0x1f, - - // v8.1a "Privileged Access Never" extension-specific PStates - PAN = 0x04, - - // v8.2a "User Access Override" extension-specific PStates - UAO = 0x03 - }; - - struct PStateMapper : AArch64NamedImmMapper { - const static Mapping PStateMappings[]; + struct PState { + const char *Name; + uint16_t Encoding; + FeatureBitset FeaturesRequired; - PStateMapper(); + bool haveFeatures(FeatureBitset ActiveFeatures) const { + return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + } }; - + #define GET_PSTATE_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PSBHint { - enum PSBHintValues { - Invalid = -1, - // v8.2a "Statistical Profiling" extension-specific PSB operands - CSync = 0x11, // psb csync = hint #0x11 - }; - - struct PSBHintMapper : AArch64NamedImmMapper { - const static Mapping PSBHintMappings[]; - - PSBHintMapper(); + struct PSB { + const char *Name; + uint16_t Encoding; }; - + #define GET_PSB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64SE { @@ -574,754 +428,36 @@ AArch64StringToVectorLayout(StringRef LayoutStr) { } namespace AArch64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 - 
ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 0xe65d // 
11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - 
DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 
0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 
- PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 
0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - 
TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, 
// 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - - // v8.1a "Privileged Access Never" extension-specific system registers - PAN = 0xc213, // 11 000 0100 0010 011 - - // v8.1a "Limited Ordering Regions" extension-specific system registers - LORSA_EL1 = 0xc520, // 11 000 1010 0100 000 - LOREA_EL1 = 0xc521, // 11 000 1010 0100 001 - LORN_EL1 = 0xc522, // 11 000 1010 0100 010 - LORC_EL1 = 0xc523, // 11 000 1010 0100 011 - LORID_EL1 = 0xc527, // 11 000 1010 0100 111 - - // v8.1a "Virtualization host extensions" system registers - TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001 - CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001 - CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000 - CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010 - CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001 - SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000 - CPACR_EL12 = 0xe882, // 11 101 0001 0000 010 - TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000 - TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001 - TCR_EL12 = 0xe902, // 11 101 0010 0000 010 - AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000 - AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001 - ESR_EL12 = 0xea90, // 11 101 0101 0010 000 - FAR_EL12 = 0xeb00, // 11 101 0110 0000 000 - MAIR_EL12 = 0xed10, // 11 101 1010 0010 000 - AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000 - VBAR_EL12 = 0xee00, // 11 101 1100 0000 000 - CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001 - CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000 - CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000 - CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001 - CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010 - 
CNTV_TVAL_EL02    = 0xef18, // 11 101 1110 0011 000
-    CNTV_CTL_EL02     = 0xef19, // 11 101 1110 0011 001
-    CNTV_CVAL_EL02    = 0xef1a, // 11 101 1110 0011 010
-    SPSR_EL12         = 0xea00, // 11 101 0100 0000 000
-    ELR_EL12          = 0xea01, // 11 101 0100 0000 001
-
-    // v8.2a registers
-    UAO               = 0xc214, // 11 000 0100 0010 100
-
-    // v8.2a "Statistical Profiling extension" registers
-    PMBLIMITR_EL1     = 0xc4d0, // 11 000 1001 1010 000
-    PMBPTR_EL1        = 0xc4d1, // 11 000 1001 1010 001
-    PMBSR_EL1         = 0xc4d3, // 11 000 1001 1010 011
-    PMBIDR_EL1        = 0xc4d7, // 11 000 1001 1010 111
-    PMSCR_EL2         = 0xe4c8, // 11 100 1001 1001 000
-    PMSCR_EL12        = 0xecc8, // 11 101 1001 1001 000
-    PMSCR_EL1         = 0xc4c8, // 11 000 1001 1001 000
-    PMSICR_EL1        = 0xc4ca, // 11 000 1001 1001 010
-    PMSIRR_EL1        = 0xc4cb, // 11 000 1001 1001 011
-    PMSFCR_EL1        = 0xc4cc, // 11 000 1001 1001 100
-    PMSEVFR_EL1       = 0xc4cd, // 11 000 1001 1001 101
-    PMSLATFR_EL1      = 0xc4ce, // 11 000 1001 1001 110
-    PMSIDR_EL1        = 0xc4cf, // 11 000 1001 1001 111
+  struct SysReg {
+    const char *Name;
+    unsigned Encoding;
+    bool Readable;
+    bool Writeable;
+    FeatureBitset FeaturesRequired;
 
-    // Cyclone specific system registers
-    CPM_IOACC_CTL_EL3 = 0xff90,
+    bool haveFeatures(FeatureBitset ActiveFeatures) const {
+      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+    }
   };
 
-  // Note that these do not inherit from AArch64NamedImmMapper. This class is
-  // sufficiently different in its behaviour that I don't believe it's worth
-  // burdening the common AArch64NamedImmMapper with abstractions only needed in
-  // this one case.
-  struct SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping SysRegMappings[];
+  #define GET_SYSREG_DECL
+  #include "AArch64GenSystemOperands.inc"
 
-    const AArch64NamedImmMapper::Mapping *InstMappings;
-    size_t NumInstMappings;
+  const SysReg *lookupSysRegByName(StringRef);
+  const SysReg *lookupSysRegByEncoding(uint16_t);
 
-    SysRegMapper() { }
-    uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
-                        bool &Valid) const;
-    std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const;
-  };
-
-  struct MSRMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MSRMappings[];
-    MSRMapper();
-  };
-
-  struct MRSMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MRSMappings[];
-    MRSMapper();
-  };
-
-  uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+  uint32_t parseGenericRegister(StringRef Name);
+  std::string genericRegisterString(uint32_t Bits);
 }
 
 namespace AArch64TLBI {
-  enum TLBIValues {
-    Invalid = -1,          // Op0 Op1 CRn CRm Op2
-    IPAS2E1IS    = 0x6401, // 01 100 1000 0000 001
-    IPAS2LE1IS   = 0x6405, // 01 100 1000 0000 101
-    VMALLE1IS    = 0x4418, // 01 000 1000 0011 000
-    ALLE2IS      = 0x6418, // 01 100 1000 0011 000
-    ALLE3IS      = 0x7418, // 01 110 1000 0011 000
-    VAE1IS       = 0x4419, // 01 000 1000 0011 001
-    VAE2IS       = 0x6419, // 01 100 1000 0011 001
-    VAE3IS       = 0x7419, // 01 110 1000 0011 001
-    ASIDE1IS     = 0x441a, // 01 000 1000 0011 010
-    VAAE1IS      = 0x441b, // 01 000 1000 0011 011
-    ALLE1IS      = 0x641c, // 01 100 1000 0011 100
-    VALE1IS      = 0x441d, // 01 000 1000 0011 101
-    VALE2IS      = 0x641d, // 01 100 1000 0011 101
-    VALE3IS      = 0x741d, // 01 110 1000 0011 101
-    VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110
-    VAALE1IS     = 0x441f, // 01 000 1000 0011 111
-    IPAS2E1      = 0x6421, // 01 100 1000 0100 001
-    IPAS2LE1     = 0x6425, // 01 100 1000 0100 101
-    VMALLE1      = 0x4438, // 01 000 1000 0111 000
-    ALLE2        = 0x6438, // 01 100 1000 0111 000
-    ALLE3        = 0x7438, // 01 110 1000 0111 000
-    VAE1         = 0x4439, // 01 000 1000 0111 001
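The new SysReg table above (and the TLBI table in the next hunk) replaces the hand-maintained mapper classes with TableGen-generated, feature-gated lookups. A minimal sketch of how a client such as the assembler might consume it; the enclosing AArch64SysReg namespace and the STI subtarget-info object are assumptions, only the declarations in the hunk itself are from the patch:

  // Sketch only: resolve a system register by name, then gate it on the
  // subtarget's feature bits before accepting it for an MRS operand.
  const AArch64SysReg::SysReg *R =
      AArch64SysReg::lookupSysRegByName("SPSR_EL1"); // namespace assumed
  if (R && R->Readable && R->haveFeatures(STI.getFeatureBits())) {
    unsigned Encoding = R->Encoding; // packed op0/op1/CRn/CRm/op2 value
    // ... emit MRS using Encoding ...
  }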
-    VAE2         = 0x6439, // 01 100 1000 0111 001
-    VAE3         = 0x7439, // 01 110 1000 0111 001
-    ASIDE1       = 0x443a, // 01 000 1000 0111 010
-    VAAE1        = 0x443b, // 01 000 1000 0111 011
-    ALLE1        = 0x643c, // 01 100 1000 0111 100
-    VALE1        = 0x443d, // 01 000 1000 0111 101
-    VALE2        = 0x643d, // 01 100 1000 0111 101
-    VALE3        = 0x743d, // 01 110 1000 0111 101
-    VMALLS12E1   = 0x643e, // 01 100 1000 0111 110
-    VAALE1       = 0x443f  // 01 000 1000 0111 111
-  };
-
-  struct TLBIMapper : AArch64NamedImmMapper {
-    const static Mapping TLBIMappings[];
-
-    TLBIMapper();
+  struct TLBI {
+    const char *Name;
+    uint16_t Encoding;
+    bool NeedsReg;
   };
-
-  static inline bool NeedsRegister(TLBIValues Val) {
-    switch (Val) {
-    case VMALLE1IS:
-    case ALLE2IS:
-    case ALLE3IS:
-    case ALLE1IS:
-    case VMALLS12E1IS:
-    case VMALLE1:
-    case ALLE2:
-    case ALLE3:
-    case ALLE1:
-    case VMALLS12E1:
-      return false;
-    default:
-      return true;
-    }
-  }
+  #define GET_TLBI_DECL
+  #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64II {
@@ -1379,12 +515,7 @@ namespace AArch64II {
     /// thread-local symbol. On Darwin, only one type of thread-local access
     /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
     /// referee will affect interpretation.
-    MO_TLS = 0x40,
-
-    /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
-    /// the address of a constant pool entry for the symbol, rather than the
-    /// address of the symbol itself.
-    MO_CONSTPOOL = 0x80
+    MO_TLS = 0x40
   };
 } // end namespace AArch64II
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
deleted file mode 100644
index 0b80f82f2b99..000000000000
--- a/lib/Target/AArch64/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Utils
-
-# Hack: we need to include 'main' AArch64 target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 4f718e1ca310..7e59710a427a 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -8,8 +8,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
-#define LLVM_LIB_TARGET_R600_AMDGPU_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
 
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
@@ -29,7 +29,6 @@ class TargetMachine;
 
 // R600 Passes
 FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600TextureIntrinsicsReplacer();
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 FunctionPass *createR600EmitClauseMarkers();
 FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
@@ -44,12 +43,14 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
-FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
+FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
-FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
-FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIDebuggerInsertNopsPass();
+FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
 
 ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
@@ -60,6 +61,9 @@ extern char &AMDGPUAnnotateKernelFeaturesID;
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIShrinkInstructionsPass(PassRegistry&);
+extern char &SIShrinkInstructionsID;
+
 void initializeSIFixSGPRCopiesPass(PassRegistry &);
 extern char &SIFixSGPRCopiesID;
@@ -69,8 +73,19 @@ extern char &SILowerI1CopiesID;
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
 
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowPassID;
+
+
 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
+FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
@@ -80,12 +95,21 @@ FunctionPass *createAMDGPUAnnotateUniformValues();
 
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
 
-void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
-extern char &SIFixSGPRLiveRangesID;
-
 void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
 extern char &AMDGPUAnnotateUniformValuesPassID;
 
+void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
+extern char &AMDGPUCodeGenPrepareID;
+
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void
initializeSIDebuggerInsertNopsPass(PassRegistry&); +extern char &SIDebuggerInsertNopsID; + +void initializeSIInsertWaitsPass(PassRegistry&); +extern char &SIInsertWaitsID; + extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -101,15 +125,6 @@ enum TargetIndex { } // End namespace llvm -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - /// OpenCL uses address spaces to differentiate between /// various memory regions on the hardware. On the CPU /// all of the address spaces point to the same memory, @@ -120,7 +135,7 @@ namespace AMDGPUAS { enum AddressSpaces : unsigned { PRIVATE_ADDRESS = 0, ///< Address space for private memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory + CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. FLAT_ADDRESS = 4, ///< Address space for flat memory. REGION_ADDRESS = 5, ///< Address space for region memory. @@ -148,8 +163,6 @@ enum AddressSpaces : unsigned { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - ADDRESS_NONE = 24, ///< Address space for unknown memory. - LAST_ADDRESS = ADDRESS_NONE, // Some places use this if the address space can't be determined. UNKNOWN_ADDRESS_SPACE = ~0u diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 844d89c737bf..72c455354411 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -1,182 +1,121 @@ -//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// include "llvm/Target/Target.td" -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -// Debugging Features - -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", - "EnableIRStructurizer", - "false", - "Disable IR Structurizer">; - -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass">; - -// Target features - -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", - "EnableIfCvt", - "false", - "Disable the if conversion pass">; +//===------------------------------------------------------------===// +// Subtarget Features (device properties) +//===------------------------------------------------------------===// def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations">; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", - "true", - "Enable double precision denormal handling", - [FeatureFP64]>; + "FP64", + "true", + "Enable double precision operations" +>; def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add", - []>; - -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling">; + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add" +>; -def Feature64BitPtr : SubtargetFeature<"64BitPtr", - "Is64bit", - "true", - "Specify if 64-bit addressing should be used">; +def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", + "HalfRate64Ops", + "true", + "Most fp64 instructions are half rate instead of quarter" +>; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding">; + "R600ALUInst", + "false", + "Older version of ALU instructions encoding" +>; def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", - "true", - "Specify use of dedicated vertex cache">; + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache" +>; def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA">; + "CaymanISA", + "true", + "Use Cayman ISA" +>; def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", - "true", - "GPU has CF_ALU bug">; - -// XXX - This should probably be removed once enabled by default -def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", - "EnableLoadStoreOpt", - "true", - "Enable SI load/store optimizer pass">; - -// Performance debugging feature. Allow using DS instruction immediate -// offsets even if the base pointer can't be proven to be base. 
On SI, -// base pointer values that won't give the same result as a 16-bit add -// are not safe to fold, but this will override the conservative test -// for the base pointer. -def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding", - "EnableUnsafeDSOffsetFolding", - "true", - "Force using DS instruction immediate offsets on SI">; - -def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", - "FlatForGlobal", - "true", - "Force to generate flat instruction for global">; + "CFALUBug", + "true", + "GPU has CF_ALU bug" +>; def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", - "Support flat address space">; + "FlatAddressSpace", + "true", + "Support flat address space" +>; -def FeatureXNACK : SubtargetFeature<"xnack", - "EnableXNACK", - "true", - "Enable XNACK support">; +def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", + "UnalignedBufferAccess", + "true", + "Support unaligned global loads and stores" +>; -def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", - "EnableVGPRSpilling", - "true", - "Enable spilling of VGPRs to scratch memory">; +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support" +>; def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size">; - -def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer", - "EnableHugeScratchBuffer", - "true", - "Enable scratch buffer sizes greater than 128 GB">; - -def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", - "EnableSIScheduler", - "true", - "Enable SI Machine Scheduler">; + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size" +>; class SubtargetFeatureFetchLimit : SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value>; + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value +>; def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; class SubtargetFeatureWavefrontSize : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast(Value), - "The number of threads per wavefront">; + "wavefrontsize"#Value, + "WavefrontSize", + !cast(Value), + "The number of threads per wavefront" +>; def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; class SubtargetFeatureLDSBankCount : SubtargetFeature < - "ldsbankcount"#Value, - "LDSBankCount", - !cast(Value), - "The number of LDS banks per compute unit.">; + "ldsbankcount"#Value, + "LDSBankCount", + !cast(Value), + "The number of LDS banks per compute unit." 
+>; def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; class SubtargetFeatureISAVersion : SubtargetFeature < - "isaver"#Major#"."#Minor#"."#Stepping, - "IsaVersion", - "ISAVersion"#Major#"_"#Minor#"_"#Stepping, - "Instruction set version number" + "isaver"#Major#"."#Minor#"."#Stepping, + "IsaVersion", + "ISAVersion"#Major#"_"#Minor#"_"#Stepping, + "Instruction set version number" >; def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; @@ -186,36 +125,145 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; class SubtargetFeatureLocalMemorySize : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast(Value), - "The size of local memory in bytes">; + "localmemorysize"#Value, + "LocalMemorySize", + !cast(Value), + "The size of local memory in bytes" +>; def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU">; + "IsGCN", + "true", + "GCN or newer GPU" +>; def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI">; + "GCN1Encoding", + "true", + "Encoding format for SI and CI" +>; def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI">; + "GCN3Encoding", + "true", + "Encoding format for VI" +>; def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional intstructions for CI+">; + "CIInsts", + "true", + "Additional intstructions for CI+" +>; + +def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", + "HasSMemRealTime", + "true", + "Has s_memrealtime instruction" +>; + +def Feature16BitInsts : SubtargetFeature<"16-bit-insts", + "Has16BitInsts", + "true", + "Has i16/f16 instructions" +>; + +//===------------------------------------------------------------===// +// Subtarget Features (options and debugging) +//===------------------------------------------------------------===// + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling" +>; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64] +>; + +def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", + "FPExceptions", + "true", + "Enable floating point exceptions" +>; + +class FeatureMaxPrivateElementSize : SubtargetFeature< + "max-private-element-size-"#size, + "MaxPrivateElementSize", + !cast(size), + "Maximum private access size may be "#size +>; + +def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; +def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; +def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory" +>; + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter" +>; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter" +>; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass" +>; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass" +>; + +// Performance debugging feature. Allow using DS instruction immediate +// offsets even if the base pointer can't be proven to be base. On SI, +// base pointer values that won't give the same result as a 16-bit add +// are not safe to fold, but this will override the conservative test +// for the base pointer. +def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature < + "unsafe-ds-offset-folding", + "EnableUnsafeDSOffsetFolding", + "true", + "Force using DS instruction immediate offsets on SI" +>; + +def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", + "EnableSIScheduler", + "true", + "Enable SI Machine Scheduler" +>; + +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global" +>; // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler" - " instructions">; + "FeatureDisable","true", + "Dummy feature to disable assembler instructions" +>; class SubtargetFeatureGeneration Implies> : @@ -227,33 +275,66 @@ def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] +>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0] +>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768] +>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] >; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, - FeatureLDSBankCount32]>; + [FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32] +>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts]>; + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts] +>; def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts]>; + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime + ] +>; + +//===----------------------------------------------------------------------===// +// Debugger related subtarget features. 
+//===----------------------------------------------------------------------===// + +def FeatureDebuggerInsertNops : SubtargetFeature< + "amdgpu-debugger-insert-nops", + "DebuggerInsertNops", + "true", + "Insert one nop instruction for each high level source statement" +>; + +def FeatureDebuggerReserveRegs : SubtargetFeature< + "amdgpu-debugger-reserve-regs", + "DebuggerReserveRegs", + "true", + "Reserve registers for debugger usage" +>; + +def FeatureDebuggerEmitPrologue : SubtargetFeature< + "amdgpu-debugger-emit-prologue", + "DebuggerEmitPrologue", + "true", + "Emit debugger prologue" +>; //===----------------------------------------------------------------------===// @@ -283,6 +364,7 @@ def NullALU : InstrItinClass; //===----------------------------------------------------------------------===// def TruePredicate : Predicate<"true">; + def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" @@ -292,6 +374,13 @@ def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index ad267d350850..63f5fb3cdf00 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -45,9 +45,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { for (Function *F : FuncsToClone) { ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap, false); + Function *NewFunc = CloneFunction(F, VMap); NewFunc->setLinkage(GlobalValue::InternalLinkage); - M.getFunctionList().push_back(NewFunc); F->replaceAllUsesWith(NewFunc); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 378183927242..0910b2877b09 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -24,6 +25,8 @@ namespace { class AMDGPUAnnotateKernelFeatures : public ModulePass { private: + static bool hasAddrSpaceCast(const Function &F); + void addAttrToCallers(Function *Intrin, StringRef AttrName); bool addAttrsForIntrinsics(Module &M, ArrayRef); @@ -40,6 +43,11 @@ public: AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } + + static bool visitConstantExpr(const ConstantExpr *CE); + static bool visitConstantExprsRecursively( + const Constant *EntryC, + SmallPtrSet &ConstantExprVisited); }; } @@ -48,12 +56,87 @@ char AMDGPUAnnotateKernelFeatures::ID = 0; char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; +INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +// The queue ptr is only needed when casting to flat, not from it. 
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+    return castRequiresQueuePtr(SrcAS);
+  }
+
+  return false;
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
+  const Constant *EntryC,
+  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
 
-INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
-                      "Add AMDGPU function attributes", false, false)
-INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
-                    "Add AMDGPU function attributes", false, false)
+  if (!ConstantExprVisited.insert(EntryC).second)
+    return false;
+
+  SmallVector<const Constant *, 16> Stack;
+  Stack.push_back(EntryC);
+
+  while (!Stack.empty()) {
+    const Constant *C = Stack.pop_back_val();
+
+    // Check this constant expression.
+    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
+      if (visitConstantExpr(CE))
+        return true;
+    }
+
+    // Visit all sub-expressions.
+    for (const Use &U : C->operands()) {
+      const auto *OpC = dyn_cast<Constant>(U);
+      if (!OpC)
+        continue;
+
+      if (!ConstantExprVisited.insert(OpC).second)
+        continue;
+
+      Stack.push_back(OpC);
+    }
+  }
+
+  return false;
+}
+
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+  SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+        if (castRequiresQueuePtr(ASC))
+          return true;
+      }
+
+      for (const Use &U : I.operands()) {
+        const auto *OpC = dyn_cast<Constant>(U);
+        if (!OpC)
+          continue;
+
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
 
 void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
                                                     StringRef AttrName) {
@@ -89,35 +172,46 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
   static const StringRef IntrinsicToAttr[][2] = {
     // .x omitted
+    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
+    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
+
+    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
+    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
+
     { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
     { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
 
     // .x omitted
     { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
     { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-
   };
 
   static const StringRef HSAIntrinsicToAttr[][2] = {
-    { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
-    { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
-    { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
-
-    { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
-    { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
-    { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
-    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }
   };
 
+  // TODO: We should not add the attributes if the known compile time workgroup
+  // size is 1 for y/z.
+  // TODO: Intrinsics that require queue ptr.
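visitConstantExprsRecursively above pairs a visited set with an explicit worklist, so shared constant subtrees are scanned once and deeply nested expressions cannot overflow the call stack. A self-contained sketch of the same pattern on a generic node type (all names here are illustrative, not from the patch):

  #include <unordered_set>
  #include <vector>

  struct Node {
    std::vector<const Node *> Ops; // operands, possibly shared between nodes
    bool Interesting = false;      // stands in for "is an addrspacecast"
  };

  // Returns true if any node reachable through operands is "interesting",
  // visiting each node at most once (mirrors visitConstantExprsRecursively).
  static bool anyInteresting(const Node *Entry,
                             std::unordered_set<const Node *> &Visited) {
    if (!Visited.insert(Entry).second)
      return false; // already seen via another entry point

    std::vector<const Node *> Stack{Entry};
    while (!Stack.empty()) {
      const Node *N = Stack.back();
      Stack.pop_back();

      if (N->Interesting)
        return true;

      for (const Node *Op : N->Ops)
        if (Visited.insert(Op).second) // push each operand only once
          Stack.push_back(Op);
    }
    return false;
  }

The same shape appears in hasAddrSpaceCast, which seeds the walk from each instruction's constant operands while sharing one visited set across the whole function.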
// We do not need to note the x workitem or workgroup id because they are
   // always initialized.
 
   bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
-  if (TT.getOS() == Triple::AMDHSA)
+  if (TT.getOS() == Triple::AMDHSA) {
     Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
 
+    for (Function &F : M) {
+      if (F.hasFnAttribute("amdgpu-queue-ptr"))
+        continue;
+
+      if (hasAddrSpaceCast(F))
+        F.addFnAttr("amdgpu-queue-ptr");
+    }
+  }
+
   return Changed;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index dfddc345f286..2010cc952265 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -43,6 +43,7 @@ public:
     AU.setPreservesAll();
   }
 
+  void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
 };
@@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
 
 char AMDGPUAnnotateUniformValues::ID = 0;
 
+static void setUniformMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+  if (I.isUnconditional())
+    return;
+
+  Value *Cond = I.getCondition();
+  if (!DA->isUniform(Cond))
+    return;
+
+  setUniformMetadata(I.getParent()->getTerminator());
+}
+
 void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
 
   if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
-    PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+    setUniformMetadata(PtrI);
 }
 
@@ -72,6 +88,9 @@ bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
 }
 
 bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
   DA = &getAnalysis<DivergenceAnalysis>();
   visit(F);
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 1239dfb235ef..cfe6346fb6b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -28,8 +28,10 @@
 #include "R600RegisterInfo.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
@@ -37,7 +39,9 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "AMDGPURuntimeMetadata.h"
 
+using namespace ::AMDGPU;
 using namespace llvm;
 
 // TODO: This should get the default rounding mode from the kernel. We just set
@@ -61,7 +65,7 @@ using namespace llvm;
 // instructions to run at the double precision rate for the device so it's
 // probably best to just report no single precision denormals.
 static uint32_t getFPMode(const MachineFunction &F) {
-  const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
+  const SISubtarget& ST = F.getSubtarget<SISubtarget>();
 
   // TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals = @@ -104,10 +108,12 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { AMDGPUTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); + TS->EmitDirectiveHSACodeObjectVersion(2, 1); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); + emitStartOfRuntimeMetadata(M); } void AMDGPUAsmPrinter::EmitFunctionBodyStart() { @@ -132,54 +138,13 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } -static bool isModuleLinkage(const GlobalValue *GV) { - switch (GV->getLinkage()) { - case GlobalValue::InternalLinkage: - case GlobalValue::CommonLinkage: - return true; - case GlobalValue::ExternalLinkage: - return false; - default: llvm_unreachable("unknown linkage type"); - } -} - void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { - AsmPrinter::EmitGlobalVariable(GV); - return; - } - - if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { - AsmPrinter::EmitGlobalVariable(GV); - return; - } - // Group segment variables aren't emitted in HSA. if (AMDGPU::isGroupSegment(GV)) return; - AMDGPUTargetStreamer *TS = - static_cast(OutStreamer->getTargetStreamer()); - if (isModuleLinkage(GV)) { - TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); - } else { - TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); - } - - MCSymbolELF *GVSym = cast(getSymbol(GV)); - const DataLayout &DL = getDataLayout(); - - // Emit the size - uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); - OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); - OutStreamer->PushSection(); - OutStreamer->SwitchSection( - getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); - const Constant *C = GV->getInitializer(); - OutStreamer->EmitLabel(GVSym); - EmitGlobalConstant(DL, C); - OutStreamer->PopSection(); + AsmPrinter::EmitGlobalVariable(GV); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -230,6 +195,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), false); + OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); + + OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), + false); + OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), + false); + + if (MF.getSubtarget().debuggerEmitPrologue()) { + OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); + OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + + Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); + } OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), @@ -268,15 +247,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } } + emitRuntimeMetadata(*MF.getFunction()); + return false; } void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const R600RegisterInfo *RI = - static_cast(STM.getRegisterInfo()); + const R600Subtarget &STM = 
MF.getSubtarget(); + const R600RegisterInfo *RI = STM.getRegisterInfo(); const R600MachineFunctionInfo *MFI = MF.getInfo(); for (const MachineBasicBlock &MBB : MF) { @@ -299,23 +279,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } unsigned RsrcReg; - if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; } } else { // R600 / R700 - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_GS: // Fall through + case CallingConv::AMDGPU_CS: // Fall through + case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; } } @@ -325,23 +305,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4); } } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) const { - const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SISubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; bool FlatUsed = false; - const SIRegisterInfo *RI = - static_cast(STM.getRegisterInfo()); + const SIRegisterInfo *RI = STM.getRegisterInfo(); + const SIInstrInfo *TII = STM.getInstrInfo(); for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -351,8 +331,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - // FIXME: This is reporting 0 for many instructions. 
- CodeSize += MI.getDesc().Size; + CodeSize += TII->getInstSizeInBytes(MI); unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { @@ -366,6 +345,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned reg = MO.getReg(); switch (reg) { case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: case AMDGPU::SCC: case AMDGPU::M0: continue; @@ -382,17 +363,32 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, FlatUsed = true; continue; + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("Trap Handler registers should not be used"); + continue; + default: break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { + if (AMDGPU::TTMP_32RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + if (AMDGPU::TTMP_64RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 2; } else if (AMDGPU::VReg_64RegClass.contains(reg)) { @@ -438,7 +434,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) ExtraSGPRs = 2; - if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { if (FlatUsed) ExtraSGPRs = 4; } else { @@ -451,23 +447,54 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MaxSGPR += ExtraSGPRs; + // Record first reserved register and reserved register count fields, and + // update max register counts if "amdgpu-debugger-reserve-regs" attribute was + // specified. + if (STM.debuggerReserveRegs()) { + ProgInfo.ReservedVGPRFirst = MaxVGPR + 1; + ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount(); + MaxVGPR += MFI->getDebuggerReservedVGPRCount(); + } + + // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and + // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" + // attribute was specified. + if (STM.debuggerEmitPrologue()) { + ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = + RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); + ProgInfo.DebuggerPrivateSegmentBufferSGPR = + RI->getHWRegIndex(MFI->getScratchRSrcReg()); + } + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
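// A scalar sketch of the extra-SGPR accounting above: using VCC reserves 2
// trailing SGPRs, and using flat scratch replaces that with a larger
// reservation (4 before Volcanic Islands; the later value is elided in this
// hunk, so it is left as a parameter). Note the assignments overwrite rather
// than accumulate, exactly as in the code. Names are illustrative only.
static unsigned countSGPRsSketch(unsigned MaxSGPRIndex, bool VCCUsed,
                                 bool FlatUsed, unsigned FlatReservation) {
  unsigned Extra = 0;
  if (VCCUsed)
    Extra = 2;                     // VCC_LO/VCC_HI
  if (FlatUsed)
    Extra = FlatReservation;       // overrides, not adds to, the VCC count
  return MaxSGPRIndex + Extra + 1; // register indices are 0-based
}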
ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { + if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("too many SGPRs used with the SGPR init bug"); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + "SGPRs with SGPR init bug", + ProgInfo.NumSGPR, DS_Error); + Ctx.diagnose(Diag); } - ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("too many user SGPRs used"); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", + MFI->NumUserSGPRs, DS_Error); + Ctx.diagnose(Diag); + } + + if (MFI->LDSSize > static_cast(STM.getLocalMemorySize())) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", + MFI->LDSSize, DS_Error); + Ctx.diagnose(Diag); } ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; @@ -476,21 +503,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // register. ProgInfo.FloatMode = getFPMode(MF); - // XXX: Not quite sure what this does, but sc seems to unset this. ProgInfo.IEEEMode = 0; - // Do not clamp NAN to 0. - ProgInfo.DX10Clamp = 0; + // Make clamp modifier on NaN input returns 0. + ProgInfo.DX10Clamp = 1; const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + ProgInfo.ScratchSize = FrameInfo->getStackSize(); ProgInfo.FlatUsed = FlatUsed; ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { @@ -503,7 +529,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; ProgInfo.LDSBlocks = - RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword blocks. unsigned ScratchAlignShift = 10; @@ -511,8 +537,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. 
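// The over-limit checks above now raise DiagnosticInfoResourceLimit through
// LLVMContext::diagnose() instead of bare emitError() calls, so a front end
// can observe or filter them. A minimal sketch of installing a handler with
// the 3.9-era function-pointer API (the handler body is illustrative):
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/LLVMContext.h"

static void resourceDiagSketch(const llvm::DiagnosticInfo &DI, void *) {
  if (DI.getSeverity() == llvm::DS_Error) {
    // e.g. record that a kernel exceeded an SGPR / user-SGPR / LDS limit
  }
}
// usage: Ctx.setDiagnosticHandler(resourceDiagSketch);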
ProgInfo.ScratchBlocks = - RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1 << ScratchAlignShift) >> ScratchAlignShift; + alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1ULL << ScratchAlignShift) >> + ScratchAlignShift; ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | @@ -544,23 +571,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B84C_EXCP_EN(0); } -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { +static unsigned getRsrcReg(CallingConv::ID CallConv) { + switch (CallConv) { default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; } } void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SISubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); @@ -577,13 +604,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { + if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); } } - if (MFI->getShaderType() == ShaderType::PIXEL) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); @@ -591,12 +618,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } + + OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); + OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); + OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); + OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); +} + +// This is supposed to be log2(Size) +static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { + switch (Size) { + case 4: + return AMD_ELEMENT_4_BYTES; + case 8: + return AMD_ELEMENT_8_BYTES; + case 16: + return AMD_ELEMENT_16_BYTES; + default: + llvm_unreachable("invalid private_element_size"); + } } void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, const SIProgramInfo &KernelInfo) const { const SIMachineFunctionInfo *MFI = MF.getInfo(); - const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SISubtarget &STM = MF.getSubtarget(); 
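// Worked numbers for the three block encodings computed above, using the
// pre-CI LDS granularity shown (LDSAlignShift == 8) and a wavefront size of
// 64; the figures are illustrative only:
//   VGPRBlocks:    17 VGPRs       -> (17 - 1) / 4 == 4
//   LDSBlocks:     1000 bytes     -> alignTo(1000, 256) >> 8 == 4
//   ScratchBlocks: 8 bytes/thread -> alignTo(8 * 64, 1024) >> 10 == 1
static_assert((17 - 1) / 4 == 4, "VGPRs are encoded in groups of 4");
static_assert(((1000 + 255) / 256 * 256) >> 8 == 4, "256-byte LDS blocks");
static_assert(((8 * 64 + 1023) / 1024 * 1024) >> 10 == 1,
              "scratch is allocated per wave, in 256-dword blocks");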
amd_kernel_code_t header; AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); @@ -606,6 +652,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, (KernelInfo.ComputePGMRSrc2 << 32); header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + AMD_HSA_BITS_SET(header.code_properties, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, + getElementByteSizeValue(STM.getMaxPrivateElementSize())); + if (MFI->hasPrivateSegmentBuffer()) { header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; @@ -646,6 +697,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + if (STM.debuggerSupported()) + header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + if (STM.isXNACKEnabled()) header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -654,9 +708,20 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; + header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + + if (STM.debuggerEmitPrologue()) { + header.debug_wavefront_private_segment_offset_sgpr = + KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + header.debug_private_segment_buffer_sgpr = + KernelInfo.DebuggerPrivateSegmentBufferSGPR; + } AMDGPUTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); TS->EmitAMDKernelCodeT(header); } @@ -680,3 +745,227 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); return false; } + +// Emit a key and an integer value for runtime metadata. +static void emitRuntimeMDIntValue(std::unique_ptr &Streamer, + RuntimeMD::Key K, uint64_t V, + unsigned Size) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(V, Size); +} + +// Emit a key and a string value for runtime metadata. +static void emitRuntimeMDStringValue(std::unique_ptr &Streamer, + RuntimeMD::Key K, StringRef S) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(S.size(), 4); + Streamer->EmitBytes(S); +} + +// Emit a key and three integer values for runtime metadata. 
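// The runtime metadata section written by the helpers above is a flat byte
// stream of records: an integer record is a 1-byte key followed by the value
// in the given number of bytes, and a string record is a 1-byte key, a
// 4-byte length, then the raw characters. A self-contained sketch of that
// layout, assuming the target's little-endian byte order (the encode* names
// are illustrative, not part of the patch):
#include <cstdint>
#include <string>
#include <vector>

static void encodeIntRecord(std::vector<uint8_t> &Out, uint8_t Key,
                            uint64_t V, unsigned Size) {
  Out.push_back(Key);                        // RuntimeMD::Key, one byte
  for (unsigned I = 0; I != Size; ++I)
    Out.push_back(uint8_t(V >> (8 * I)));    // little-endian value bytes
}

static void encodeStringRecord(std::vector<uint8_t> &Out, uint8_t Key,
                               const std::string &S) {
  Out.push_back(Key);                        // RuntimeMD::Key, one byte
  uint32_t Len = uint32_t(S.size());
  for (unsigned I = 0; I != 4; ++I)
    Out.push_back(uint8_t(Len >> (8 * I)));  // 4-byte length
  Out.insert(Out.end(), S.begin(), S.end()); // raw string bytes
}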
+// The three integer values are obtained from MDNode \p Node; +static void emitRuntimeMDThreeIntValues(std::unique_ptr &Streamer, + RuntimeMD::Key K, MDNode *Node, + unsigned Size) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(mdconst::extract( + Node->getOperand(0))->getZExtValue(), Size); + Streamer->EmitIntValue(mdconst::extract( + Node->getOperand(1))->getZExtValue(), Size); + Streamer->EmitIntValue(mdconst::extract( + Node->getOperand(2))->getZExtValue(), Size); +} + +void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { + OutStreamer->SwitchSection(getObjFileLowering().getContext() + .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); + + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, + RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); + if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + auto Node = MD->getOperand(0); + unsigned short Major = mdconst::extract(Node->getOperand(0)) + ->getZExtValue(); + unsigned short Minor = mdconst::extract(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } +} + +static std::string getOCLTypeName(Type *Ty, bool isSigned) { + if (VectorType* VecTy = dyn_cast(Ty)) { + Type* EleTy = VecTy->getElementType(); + unsigned Size = VecTy->getVectorNumElements(); + return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str(); + } + switch (Ty->getTypeID()) { + case Type::HalfTyID: return "half"; + case Type::FloatTyID: return "float"; + case Type::DoubleTyID: return "double"; + case Type::IntegerTyID: { + if (!isSigned) + return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str(); + auto IntTy = cast(Ty); + auto BW = IntTy->getIntegerBitWidth(); + switch (BW) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BW)).str(); + } + } + default: + llvm_unreachable("invalid type"); + } +} + +static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType( + Type *Ty, StringRef TypeName) { + if (auto VT = dyn_cast(Ty)) + return getRuntimeMDValueType(VT->getElementType(), TypeName); + else if (auto PT = dyn_cast(Ty)) + return getRuntimeMDValueType(PT->getElementType(), TypeName); + else if (Ty->isHalfTy()) + return RuntimeMD::KernelArg::F16; + else if (Ty->isFloatTy()) + return RuntimeMD::KernelArg::F32; + else if (Ty->isDoubleTy()) + return RuntimeMD::KernelArg::F64; + else if (IntegerType* intTy = dyn_cast(Ty)) { + bool Signed = !TypeName.startswith("u"); + switch (intTy->getIntegerBitWidth()) { + case 8: + return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8; + case 16: + return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16; + case 32: + return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32; + case 64: + return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64; + default: + // Runtime does not recognize other integer types. Report as + // struct type. 
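// Two compact encodings are used in emitStartOfRuntimeMetadata() above: the
// metadata format version is packed as (MDVersion << 8) | MDRevision into a
// 2-byte record, and the OpenCL language version as Major * 100 + Minor * 10,
// so OpenCL C 2.0 is stored as 200. Similarly, getOCLTypeName() builds vector
// names from the element name plus the lane count ("float" + "4" -> "float4")
// and prefixes 'u' for unsigned integers ("uint"). Worked figures only:
static_assert((1 << 8 | 2) == 0x0102, "version 1, revision 2 -> 0x0102");
static_assert(2 * 100 + 0 * 10 == 200, "OpenCL C 2.0 -> 200");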
+ return RuntimeMD::KernelArg::Struct; + } + } else + return RuntimeMD::KernelArg::Struct; +} + +void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) { + if (!F.getMetadata("kernel_arg_type")) + return; + + MCContext &Context = getObjFileLowering().getContext(); + OutStreamer->SwitchSection( + Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); + OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName()); + + for (auto &Arg:F.args()) { + // Emit KeyArgBegin. + unsigned I = Arg.getArgNo(); + OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1); + + // Emit KeyArgSize and KeyArgAlign. + auto T = Arg.getType(); + auto DL = F.getParent()->getDataLayout(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize, + DL.getTypeAllocSize(T), 4); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign, + DL.getABITypeAlignment(T), 4); + + // Emit KeyArgTypeName. + auto TypeName = dyn_cast(F.getMetadata( + "kernel_arg_type")->getOperand(I))->getString(); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName); + + // Emit KeyArgName. + if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) { + auto ArgName = cast(ArgNameMD->getOperand( + I))->getString(); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName); + } + + // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe. + auto TypeQual = cast(F.getMetadata( + "kernel_arg_type_qual")->getOperand(I))->getString(); + SmallVector SplitQ; + TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/); + for (auto &I:SplitQ) { + auto Key = StringSwitch(I) + .Case("volatile", RuntimeMD::KeyArgIsVolatile) + .Case("restrict", RuntimeMD::KeyArgIsRestrict) + .Case("const", RuntimeMD::KeyArgIsConst) + .Case("pipe", RuntimeMD::KeyArgIsPipe) + .Default(RuntimeMD::KeyNull); + OutStreamer->EmitIntValue(Key, 1); + } + + // Emit KeyArgTypeKind. + auto BaseTypeName = cast( + F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString(); + auto TypeKind = StringSwitch(BaseTypeName) + .Case("sampler_t", RuntimeMD::KernelArg::Sampler) + .Case("queue_t", RuntimeMD::KernelArg::Queue) + .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", + "image2d_t" , "image2d_array_t", RuntimeMD::KernelArg::Image) + .Cases("image2d_depth_t", "image2d_array_depth_t", + "image2d_msaa_t", "image2d_array_msaa_t", + "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image) + .Cases("image2d_array_msaa_depth_t", "image3d_t", + RuntimeMD::KernelArg::Image) + .Default(isa(T) ? RuntimeMD::KernelArg::Pointer : + RuntimeMD::KernelArg::Value); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1); + + // Emit KeyArgValueType. + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType, + getRuntimeMDValueType(T, BaseTypeName), 2); + + // Emit KeyArgAccQual. + auto AccQual = cast(F.getMetadata( + "kernel_arg_access_qual")->getOperand(I))->getString(); + auto AQ = StringSwitch(AccQual) + .Case("read_only", RuntimeMD::KernelArg::ReadOnly) + .Case("write_only", RuntimeMD::KernelArg::WriteOnly) + .Case("read_write", RuntimeMD::KernelArg::ReadWrite) + .Default(RuntimeMD::KernelArg::None); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual, + AQ, 1); + + // Emit KeyArgAddrQual. 
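// Each kernel argument above becomes a KeyArgBegin ... KeyArgEnd group of
// records. The type-qualifier string from !kernel_arg_type_qual is split on
// spaces with empty tokens dropped, so "volatile const" produces one
// KeyArgIsVolatile and one KeyArgIsConst record, while unrecognized tokens
// emit KeyNull. A standard-library sketch of that tokenisation
// (splitQualifiers is illustrative, not part of the patch):
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> splitQualifiers(const std::string &TypeQual) {
  std::vector<std::string> Tokens;
  std::istringstream SS(TypeQual);
  for (std::string Tok; SS >> Tok;) // stream extraction skips empty entries
    Tokens.push_back(Tok);
  return Tokens;                    // {"volatile", "const"} for the example
}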
+ if (isa(T)) + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual, + T->getPointerAddressSpace(), 1); + + // Emit KeyArgEnd + OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1); + } + + // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint. + if (auto RWGS = F.getMetadata("reqd_work_group_size")) + emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize, + RWGS, 4); + if (auto WGSH = F.getMetadata("work_group_size_hint")) + emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint, + WGSH, 4); + if (auto VTH = F.getMetadata("vec_type_hint")) { + auto TypeName = getOCLTypeName(cast( + VTH->getOperand(0))->getType(), mdconst::extract( + VTH->getOperand(1))->getZExtValue()); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint, + TypeName); + } + + // Emit KeyKernelEnd + OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1); +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 99d4091670fe..7b04c539520d 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -12,15 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H -#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #include "llvm/CodeGen/AsmPrinter.h" #include namespace llvm { -class AMDGPUAsmPrinter : public AsmPrinter { +class AMDGPUAsmPrinter final : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : @@ -40,6 +40,10 @@ private: NumVGPR(0), NumSGPR(0), FlatUsed(false), + ReservedVGPRFirst(0), + ReservedVGPRCount(0), + DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), + DebuggerPrivateSegmentBufferSGPR((uint16_t)-1), VCCUsed(false), CodeLen(0) {} @@ -67,6 +71,20 @@ private: uint32_t LDSSize; bool FlatUsed; + // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first + // fixed VGPR number reserved. + uint16_t ReservedVGPRFirst; + // The number of consecutive VGPRs reserved. + uint16_t ReservedVGPRCount; + + // Fixed SGPR number used to hold wave scratch offset for entire kernel + // execution, or uint16_t(-1) if the register is not used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire + // kernel execution, or uint16_t(-1) if the register is not used or not + // known. + uint16_t DebuggerPrivateSegmentBufferSGPR; + // Bonus information for debugging. bool VCCUsed; uint64_t CodeLen; @@ -109,6 +127,10 @@ public: unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; + void emitStartOfRuntimeMetadata(const Module &M); + + void emitRuntimeMetadata(const Function &F); + protected: std::vector DisasmLines, HexLines; size_t DisasmLineMaxLen; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp new file mode 100644 index 000000000000..1a1da8a254a7 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -0,0 +1,42 @@ +//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPUCallLowering.h" +#include "AMDGPUISelLowering.h" + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) + : CallLowering(&TLI) { +} + +bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + return true; +} + +bool AMDGPUCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, + const SmallVectorImpl &VRegs) const { + // TODO: Implement once there are generic loads/stores. + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h new file mode 100644 index 000000000000..61174bacdac3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -0,0 +1,36 @@ +//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H + +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class AMDGPUTargetLowering; + +class AMDGPUCallLowering: public CallLowering { + public: + AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + bool + lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function::ArgumentListType &Args, + const SmallVectorImpl &VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index b0db26124a0c..47dfa4992068 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -110,21 +110,19 @@ def CC_R600 : CallingConv<[ // Calling convention for compute kernels def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateStack"> + CCCustom<"allocateKernArg"> ]>; def CC_AMDGPU : CallingConv<[ CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() >=" "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo>, CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo>, CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp 
b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..3b415774df49
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,82 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+  DivergenceAnalysis *DA;
+  const TargetMachine *TM;
+
+public:
+  static char ID;
+  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
+    FunctionPass(ID),
+    TM(TM) { }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  const char *getPassName() const override {
+    return "AMDGPU IR optimizations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+};
+
+} // End anonymous namespace
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+  return false;
+}
+
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+  if (!TM || skipFunction(F))
+    return false;
+
+  DA = &getAnalysis<DivergenceAnalysis>();
+  visit(F);
+
+  return true;
+}
+
+INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+                         "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+                       "AMDGPU IR optimizations", false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+  return new AMDGPUCodeGenPrepare(TM);
}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
deleted file mode 100644
index 2f6b3022dd6e..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUDiagnosticInfoUnsupported.h"
-
-using namespace llvm;
-
-DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
-  const Function &Fn,
-  const Twine &Desc,
-  DiagnosticSeverity Severity)
-  : DiagnosticInfo(getKindID(), Severity),
-    Description(Desc),
-    Fn(Fn) { }
-
-int DiagnosticInfoUnsupported::KindID = 0;
-
-void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
-  DP << "unsupported " << getDescription() << " in " << Fn.getName();
-}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
deleted file mode 100644
index 0fd37e1ede6b..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-
-namespace llvm {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
-  const Twine &Description;
-  const Function &Fn;
-
-  static int KindID;
-
-  static int getKindID() {
-    if (KindID == 0)
-      KindID = llvm::getNextAvailablePluginDiagnosticKind();
-    return KindID;
-  }
-
-public:
-  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
-                            DiagnosticSeverity Severity = DS_Error);
-
-  const Function &getFunction() const { return Fn; }
-  const Twine &getDescription() const { return Description; }
-
-  void print(DiagnosticPrinter &DP) const override;
-
-  static bool classof(const DiagnosticInfo *DI) {
-    return DI->getKind() == getKindID();
-  }
-};
-
-}
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 4d84d281d998..bbc28b885721 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -7,12 +7,13 @@
 //
 //==-----------------------------------------------------------------------===//
 //
-// Interface to describe a layout of a stack frame on a AMDIL target machine
+// Interface to describe a layout of a stack frame on an AMDGPU target machine.
// //===----------------------------------------------------------------------===// #include "AMDGPUFrameLowering.h" #include "AMDGPURegisterInfo.h" -#include "R600MachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" + #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Instructions.h" @@ -57,7 +58,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { // T2.Y = stack[1].y // T3.X = stack[1].z // T3.Y = stack[1].w - // + // // StackWidth = 4: // T0.X = stack[0].x // T0.Y = stack[0].y @@ -75,7 +76,8 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const AMDGPURegisterInfo *RI + = MF.getSubtarget().getRegisterInfo(); // Fill in FrameReg output argument. FrameReg = RI->getFrameRegister(MF); @@ -87,32 +89,16 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i)); OffsetBytes += MFI->getObjectSize(i); // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + OffsetBytes = alignTo(OffsetBytes, 4); } if (FI != -1) - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI)); return OffsetBytes / (getStackWidth(MF) * 4); } -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; -} -void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} - -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 257a3da40589..513848a1d887 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -32,13 +32,13 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. 
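// A worked pass through the getFrameIndexReference() loop above with
// StackWidth 1 (each frame "register" covers 4 bytes): with object 0 of size
// 6 and object 1 aligned to 4 bytes, object 1 starts at alignTo(6, 4) == 8
// bytes, i.e. register-relative index 8 / (1 * 4) == 2. Illustrative only.
static_assert(((0 + 6 + 3) / 4 * 4) / (1 * 4) == 2,
              "the second object lands at frame-register index 2");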
unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; + + bool hasFP(const MachineFunction &MF) const override { + return false; + } }; } // namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b33040b4d06a..23c9352ce273 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1,4 +1,4 @@ -//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===// // // The LLVM Compiler Infrastructure // @@ -12,30 +12,44 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "SIDefines.h" #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DiagnosticInfo.h" using namespace llvm; +namespace llvm { +class R600InstrInfo; +} + //===----------------------------------------------------------------------===// // Instruction Selector Implementation //===----------------------------------------------------------------------===// namespace { + +static bool isCBranchSCC(const SDNode *N) { + assert(N->getOpcode() == ISD::BRCOND); + if (!N->hasOneUse()) + return false; + + SDValue Cond = N->getOperand(1); + if (Cond.getOpcode() == ISD::CopyToReg) + Cond = Cond.getOperand(2); + return Cond.getOpcode() == ISD::SETCC && + Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse(); +} + /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -47,7 +61,7 @@ public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); bool runOnMachineFunction(MachineFunction &MF) override; - SDNode *Select(SDNode *N) override; + void Select(SDNode *N) override; const char *getPassName() const override; void PreprocessISelDAG() override; void PostprocessISelDAG() override; @@ -59,28 +73,8 @@ private: bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); - // Complex pattern selectors - bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); - bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); - bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - - static bool checkType(const Value *ptr, unsigned int addrspace); - static bool checkPrivateAddress(const MachineMemOperand *Op); - - static bool isGlobalStore(const StoreSDNode *N); - static bool isFlatStore(const StoreSDNode *N); - static bool isPrivateStore(const StoreSDNode *N); - static bool isLocalStore(const StoreSDNode *N); - static bool isRegionStore(const StoreSDNode *N); - - bool isCPLoad(const LoadSDNode *N) const; - bool isConstantLoad(const LoadSDNode *N, int cbID) const; - bool isGlobalLoad(const LoadSDNode *N) const; - bool isFlatLoad(const LoadSDNode *N) const; - bool isParamLoad(const LoadSDNode *N) const; - bool isPrivateLoad(const LoadSDNode *N) const; - bool isLocalLoad(const LoadSDNode *N) const; - bool isRegionLoad(const LoadSDNode *N) const; + bool isConstantLoad(const MemSDNode *N, int cbID) const; + bool isUniformBr(const SDNode *N) const; SDNode *glueCopyToM0(SDNode *N) const; @@ -111,7 +105,20 @@ private: SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &GLC) const; + SDValue &Offset, SDValue &SLC) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset, SDValue &VOffset) const; + + bool SelectFlat(SDValue Addr, SDValue &VAddr, + SDValue &SLC, SDValue &TFE) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, @@ -122,7 +129,7 @@ private: bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; - SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -136,13 +143,15 @@ private: SDValue &Clamp, SDValue &Omod) const; - SDNode *SelectADD_SUB_I64(SDNode *N); - SDNode *SelectDIV_SCALE(SDNode *N); + void SelectADD_SUB_I64(SDNode *N); + void SelectDIV_SCALE(SDNode *N); - SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width); - SDNode *SelectS_BFEFromShifts(SDNode *N); - SDNode 
*SelectS_BFE(SDNode *N); + void SelectS_BFEFromShifts(SDNode *N); + void SelectS_BFE(SDNode *N); + void SelectBRCOND(SDNode *N); + void SelectATOMIC_CMP_SWAP(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -159,7 +168,7 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) : SelectionDAGISel(TM) {} bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); + Subtarget = &MF.getSubtarget(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -207,64 +216,9 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - return SelectADDRParam(Addr, R1, R2); -} - - -bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - return true; -} - SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - !checkType(cast(N)->getMemOperand()->getValue(), - AMDGPUAS::LOCAL_ADDRESS)) + cast(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) return N; const SITargetLowering& Lowering = @@ -304,14 +258,15 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { llvm_unreachable("invalid vector size"); } -SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { +void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { N->setNodeId(-1); - return nullptr; // Already selected. + return; // Already selected. 
} - if (isa(N)) + if (isa(N) || + (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC)) N = glueCopyToM0(N); switch (Opc) { @@ -325,7 +280,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - return SelectADD_SUB_I64(N); + SelectADD_SUB_I64(N); + return; } case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: @@ -359,8 +315,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, - N->getOperand(0), RegClass); + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), + RegClass); + return; } assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " @@ -400,8 +357,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!IsRegSeq) break; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); + return; } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; @@ -422,8 +379,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, N->getOperand(1), SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops)); + return; } case ISD::Constant: @@ -452,8 +410,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops)); + return; } case ISD::LOAD: case ISD::STORE: { @@ -487,11 +446,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { uint32_t OffsetVal = Offset->getZExtValue(); uint32_t WidthVal = Width->getZExtValue(); - return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), - N->getOperand(0), OffsetVal, WidthVal); + ReplaceNode(N, getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, + SDLoc(N), N->getOperand(0), OffsetVal, WidthVal)); + return; } case AMDGPUISD::DIV_SCALE: { - return SelectDIV_SCALE(N); + SelectDIV_SCALE(N); + return; } case ISD::CopyToReg: { const SITargetLowering& Lowering = @@ -499,139 +460,48 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Lowering.legalizeTargetIndependentNode(N, *CurDAG); break; } - case ISD::ADDRSPACECAST: - return SelectAddrSpaceCast(N); case ISD::AND: case ISD::SRL: case ISD::SRA: + case ISD::SIGN_EXTEND_INREG: if (N->getValueType(0) != MVT::i32 || Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - return SelectS_BFE(N); + SelectS_BFE(N); + return; + case ISD::BRCOND: + SelectBRCOND(N); + return; + + case AMDGPUISD::ATOMIC_CMP_SWAP: + SelectATOMIC_CMP_SWAP(N); + return; } - return SelectCode(N); + SelectCode(N); } -bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { - assert(AS != 0 && "Use checkPrivateAddress instead."); - if (!Ptr) +bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { + if (!N->readMem()) return false; - - return Ptr->getType()->getPointerAddressSpace() == AS; -} - -bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { - if (Op->getPseudoValue()) - return true; - - if (PointerType *PT = dyn_cast(Op->getValue()->getType())) - return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; - - return false; -} - -bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - const Value *MemVal = N->getMemOperand()->getValue(); - return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); -} - -bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - const Value *MemVal = N->getMemOperand()->getValue(); if (CbId == -1) - return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; - return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); + return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; } -bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) - return true; - - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { 
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { - MachineMemOperand *MMO = N->getMemOperand(); - if (checkPrivateAddress(N->getMemOperand())) { - if (MMO) { - const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV->isConstantPool()) { - return true; - } - } - } - return false; -} - -bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkPrivateAddress(N->getMemOperand())) { - // Check to make sure we are not a constant pool load or a constant load - // that is marked as a private load - if (isCPLoad(N) || isConstantLoad(N, -1)) { - return false; - } - } - - const Value *MemVal = N->getMemOperand()->getValue(); - if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && - !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { - return true; - } - return false; +bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const Instruction *Term = BB->getTerminator(); + return Term->getMetadata("amdgpu.uniform") || + Term->getMetadata("structurizecfg.uniform"); } const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } -#ifdef DEBUGTMP -#undef INT64_C -#endif -#undef DEBUGTMP - //===----------------------------------------------------------------------===// // Complex Patterns //===----------------------------------------------------------------------===// @@ -705,7 +575,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -728,7 +598,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; @@ -745,12 +614,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDValue(AddHi,0), Sub1, }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. 
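// SelectADD_SUB_I64 above splits a 64-bit scalar add/sub into 32-bit halves:
// S_ADD_U32 produces the low half plus a carry in SCC, S_ADDC_U32 consumes
// that carry for the high half, and REG_SEQUENCE stitches the two results
// back into one 64-bit value. The equivalent plain-integer model (the add
// case; the sub case swaps in S_SUB_U32/S_SUBB_U32):
#include <cstdint>

static uint64_t add64ViaHalvesSketch(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);                     // S_ADD_U32
  uint32_t Carry = Lo < uint32_t(A) ? 1 : 0;                   // carry-out -> SCC
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // S_ADDC_U32
  return (uint64_t(Hi) << 32) | Lo;                            // REG_SEQUENCE sub0+sub1
}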
-SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -766,7 +635,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); + CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); } bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -786,6 +655,7 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, SDValue &Offset) const { + SDLoc DL(Addr); if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -793,7 +663,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { // (add n0, c0) Base = N0; - Offset = N1; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } else if (Addr.getOpcode() == ISD::SUB) { @@ -801,7 +671,6 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, if (const ConstantSDNode *C = dyn_cast(Addr.getOperand(0))) { int64_t ByteOffset = C->getSExtValue(); if (isUInt<16>(ByteOffset)) { - SDLoc DL(Addr); SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); // XXX - This is kind of hacky. Create a dummy sub node so we can check @@ -816,7 +685,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); - Offset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); return true; } } @@ -834,7 +703,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); Base = SDValue(MovZero, 0); - Offset = Addr; + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; } } @@ -932,8 +801,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDLoc DL(Addr); - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + if (!GLC.getNode()) + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + if (!SLC.getNode()) + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -961,9 +832,11 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, } if (isLegalMUBUFImmOffset(C1)) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } else if (isUInt<32>(C1->getZExtValue())) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + + if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, @@ -1045,14 +918,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); + // Offsets in vaddr must be positive. 
- if (CurDAG->SignBitIsZero(N0)) { - ConstantSDNode *C1 = cast(N1); - if (isLegalMUBUFImmOffset(C1)) { - VAddr = N0; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } + ConstantSDNode *C1 = cast(N1); + if (isLegalMUBUFImmOffset(C1)) { + VAddr = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; } } @@ -1090,14 +962,119 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return false; } +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset + ) const { + SDValue GLC, SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, - SDValue &GLC) const { - SDValue SLC, TFE; + SDValue &SLC) const { + SDValue GLC, TFE; return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Constant); + uint32_t Imm = cast(Constant)->getZExtValue(); + uint32_t Overflow = 0; + + if (Imm >= 4096) { + if (Imm <= 4095 + 64) { + // Use an SOffset inline constant for 1..64 + Overflow = Imm - 4095; + Imm = 4095; + } else { + // Try to keep the same value in SOffset for adjacent loads, so that + // the corresponding register contents can be re-used. + // + // Load values with all low-bits set into SOffset, so that a larger + // range of values can be covered using s_movk_i32 + uint32_t High = (Imm + 1) & ~4095; + uint32_t Low = (Imm + 1) & 4095; + Imm = Low; + Overflow = High - 1; + } + } + + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (Overflow > 0 && + Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + + ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); + + if (Overflow <= 64) + SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32); + else + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), + 0); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Offset); + + if (!isa(Offset)) + return false; + + return SelectMUBUFConstant(Offset, SOffset, ImmOffset); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset, + SDValue &VOffset) const { + SDLoc DL(Offset); + + // Don't generate an unnecessary voffset for constant offsets. + if (isa(Offset)) { + SDValue Tmp1, Tmp2; + + // When necessary, use a voffset in <= CI anyway to work around a hardware + // bug. 
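// SelectMUBUFConstant above splits a constant buffer offset into a 12-bit
// immediate (0..4095) plus an SOffset value: offsets up to 4095 + 64 park the
// excess in an SOffset inline constant, and larger ones put a value with all
// low 12 bits set into SOffset so adjacent loads can share one s_movk_i32.
// A plain-arithmetic model (splitMUBUFOffsetSketch is illustrative; the
// SI/CI early-out for a non-zero SOffset is omitted here):
#include <cassert>
#include <cstdint>

static void splitMUBUFOffsetSketch(uint32_t Offset, uint32_t &SOff,
                                   uint32_t &ImmOff) {
  uint32_t Imm = Offset, Overflow = 0;
  if (Imm >= 4096) {
    if (Imm <= 4095 + 64) {
      Overflow = Imm - 4095;               // SOffset inline constant 1..64
      Imm = 4095;
    } else {
      Overflow = ((Imm + 1) & ~4095u) - 1; // low 12 bits all set: s_movk-friendly
      Imm = (Imm + 1) & 4095u;
    }
  }
  SOff = Overflow;
  ImmOff = Imm;
  assert(SOff + ImmOff == Offset && "the split must preserve the address");
}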
+ if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS || + SelectMUBUFConstant(Offset, Tmp1, Tmp2)) + return false; + } + + if (CurDAG->isBaseWithConstantOffset(Offset)) { + SDValue N0 = Offset.getOperand(0); + SDValue N1 = Offset.getOperand(1); + if (cast(N1)->getSExtValue() >= 0 && + SelectMUBUFConstant(N1, SOffset, ImmOffset)) { + VOffset = N0; + return true; + } + } + + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + VOffset = Offset; + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr, + SDValue &VAddr, + SDValue &SLC, + SDValue &TFE) const { + VAddr = Addr; + TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; +} + /// /// \param EncodedOffset This is the immediate value that will be encoded /// directly into the instruction. On SI/CI the \p EncodedOffset @@ -1213,71 +1190,33 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, !isa(Offset); } -// FIXME: This is incorrect and only enough to be able to compile. -SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { - AddrSpaceCastSDNode *ASC = cast(N); - SDLoc DL(N); - - const MachineFunction &MF = CurDAG->getMachineFunction(); - DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), - "addrspacecast not implemented"); - CurDAG->getContext()->diagnose(NotImplemented); - - assert(Subtarget->hasFlatAddressSpace() && - "addrspacecast only supported with flat address space!"); - - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && - "Can only cast to / from flat address space!"); - - // The flat instructions read the address as the index of the VGPR holding the - // address, so casting should just be reinterpreting the base VGPR, so just - // insert trunc / bitcast / zext. 
- - SDValue Src = ASC->getOperand(0); - EVT DestVT = ASC->getValueType(0); - EVT SrcVT = Src.getValueType(); - - unsigned SrcSize = SrcVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - - if (SrcSize > DestSize) { - assert(SrcSize == 64 && DestSize == 32); - return CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, - DL, - DestVT, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); - } - - if (DestSize > SrcSize) { - assert(SrcSize == 32 && DestSize == 64); - - // FIXME: This is probably wrong, we should never be defining - // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, - MVT::i32); +bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, + SDValue &Base, + SDValue &Offset) const { + SDLoc DL(Index); - const SDValue Ops[] = { - RC, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(0, DL, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; + if (CurDAG->isBaseWithConstantOffset(Index)) { + SDValue N0 = Index.getOperand(0); + SDValue N1 = Index.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); + // (add n0, c0) + Base = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); + return true; } - assert(SrcSize == 64 && DestSize == 64); - return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); + if (isa<ConstantSDNode>(Index)) + return false; + + Base = Index; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + return true; } -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width) { +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL, + SDValue Val, uint32_t Offset, + uint32_t Width) { // Transformation function, pack the offset and width of a BFE into // the format expected by the S_BFE_I32 / S_BFE_U32. In the second // source, bits [5:0] contain the offset and bits [22:16] the width. @@ -1287,7 +1226,7 @@ SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); } -SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) // Predicate: 0 < b <= c < 32 @@ -1304,14 +1243,15 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { bool Signed = N->getOpcode() == ISD::SRA; unsigned Opcode = Signed ?
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), - CVal - BVal, 32 - CVal); + ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, + 32 - CVal)); + return; } } - return SelectCode(N); + SelectCode(N); } -SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { switch (N->getOpcode()) { case ISD::AND: if (N->getOperand(0).getOpcode() == ISD::SRL) { @@ -1328,8 +1268,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), - ShiftVal, WidthVal); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), + Srl.getOperand(0), ShiftVal, WidthVal)); + return; } } } @@ -1349,20 +1290,139 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), - ShiftVal, WidthVal); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), + And.getOperand(0), ShiftVal, WidthVal)); + return; } } - } else if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); + } else if (N->getOperand(0).getOpcode() == ISD::SHL) { + SelectS_BFEFromShifts(N); + return; + } break; case ISD::SRA: - if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); + if (N->getOperand(0).getOpcode() == ISD::SHL) { + SelectS_BFEFromShifts(N); + return; + } break; + + case ISD::SIGN_EXTEND_INREG: { + // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 + SDValue Src = N->getOperand(0); + if (Src.getOpcode() != ISD::SRL) + break; + + const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); + if (!Amt) + break; + + unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), + Amt->getZExtValue(), Width)); + return; + } } - return SelectCode(N); + SelectCode(N); } + +void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { + SDValue Cond = N->getOperand(1); + + if (isCBranchSCC(N)) { + // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. + SelectCode(N); + return; + } + + // The result of VOPC instructions is or'd against ~EXEC before it is + // written to vcc or another SGPR. This means that the value '1' is always + // written to the corresponding bit for results that are masked. In order + // to correctly check against vccz, we need to and VCC with the EXEC + // register in order to clear the value from the masked bits. + + SDLoc SL(N); + + SDNode *MaskedCond = + CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, + CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), + Cond); + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, + SDValue(MaskedCond, 0), + SDValue()); // Passing SDValue() adds a + // glue output. + CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, + N->getOperand(2), // Basic Block + VCC.getValue(0), // Chain + VCC.getValue(1)); // Glue + return; +} + +// This is here because there isn't a way to use the generated sub0_sub1 as the +// subreg index to EXTRACT_SUBREG in tablegen.
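// [Editor's note] The VCC/EXEC interaction that SelectBRCOND compensates for
// above can be modeled with plain 64-bit masks. Hypothetical standalone
// sketch; the mask values are made up for illustration.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0x00000000000000FFull; // Eight active lanes.
  uint64_t Cmp = 0;                      // Compare is false in every active lane.
  uint64_t Vcc = Cmp | ~Exec;            // VOPC writes result | ~EXEC.
  assert(Vcc != 0);                      // Raw VCC is nonzero, so vccz would not fire.
  assert((Vcc & Exec) == 0);             // After S_AND_B64 with EXEC it is zero, as intended.
}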
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { + MemSDNode *Mem = cast<MemSDNode>(N); + unsigned AS = Mem->getAddressSpace(); + if (AS == AMDGPUAS::FLAT_ADDRESS) { + SelectCode(N); + return; + } + + MVT VT = N->getSimpleValueType(0); + bool Is32 = (VT == MVT::i32); + SDLoc SL(N); + + MachineSDNode *CmpSwap = nullptr; + if (Subtarget->hasAddr64()) { + SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC; + + if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64; + SDValue CmpVal = Mem->getOperand(2); + + // XXX - Do we care about glue operands? + + SDValue Ops[] = { + CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() + }; + + CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); + } + } + + if (!CmpSwap) { + SDValue SRsrc, SOffset, Offset, SLC; + if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET; + + SDValue CmpVal = Mem->getOperand(2); + SDValue Ops[] = { + CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() + }; + + CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); + } + } + + if (!CmpSwap) { + SelectCode(N); + return; + } + + MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1); + *MMOs = Mem->getMemOperand(); + CmpSwap->setMemRefs(MMOs, MMOs + 1); + + unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + SDValue Extract + = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); + + ReplaceUses(SDValue(N, 0), Extract); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); + CurDAG->RemoveDeadNode(N); } bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, @@ -1432,62 +1492,59 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, } void AMDGPUDAGToDAGISel::PreprocessISelDAG() { - bool Modified = false; - - // XXX - Other targets seem to be able to do this without a worklist. - SmallVector<LoadSDNode *, 8> LoadsToReplace; - SmallVector<StoreSDNode *, 8> StoresToReplace; - - for (SDNode &Node : CurDAG->allnodes()) { - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { - EVT VT = LD->getValueType(0); - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) - continue; - - // To simplify the TableGen patterns, we replace all i64 loads with v2i32 - // loads. Alternatively, we could promote i64 loads to v2i32 during DAG - // legalization, however, some places (ExpandUnalignedLoad) in the DAG - // legalizer assume that if i64 is legal, so doing this promotion early - // can cause problems. - LoadsToReplace.push_back(LD); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { - // Handle i64 stores here for the same reason mentioned above for loads. - SDValue Value = ST->getValue(); - if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) - continue; - StoresToReplace.push_back(ST); + MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo(); + + // Handle the perverse case where a frame index is being stored. We don't + // want to see multiple frame index operands on the same instruction since + // it complicates things and violates some assumptions about frame index + // lowering. + for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd(); + I != E; ++I) { + SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32); + + // It's possible that we have a frame index defined in the function that + // isn't used in this block.
+ if (FI.use_empty()) + continue; + + // Skip over the AssertZext inserted during lowering. + SDValue EffectiveFI = FI; + auto It = FI->use_begin(); + if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) { + EffectiveFI = SDValue(*It, 0); + It = EffectiveFI->use_begin(); } - } - - for (LoadSDNode *LD : LoadsToReplace) { - SDLoc SL(LD); - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); - Modified = true; - } - for (StoreSDNode *ST : StoresToReplace) { - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), - MVT::v2i32, ST->getValue()); - const SDValue StoreOps[] = { - ST->getChain(), - NewValue, - ST->getBasePtr(), - ST->getOffset() - }; + for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) { + SDUse &Use = It.getUse(); + SDNode *User = Use.getUser(); + unsigned OpIdx = It.getOperandNo(); + ++It; + + if (MemSDNode *M = dyn_cast<MemSDNode>(User)) { + unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1; + if (OpIdx == PtrIdx) + continue; + + unsigned OpN = M->getNumOperands(); + SDValue NewOps[8]; + + assert(OpN < array_lengthof(NewOps)); + for (unsigned Op = 0; Op != OpN; ++Op) { + if (Op != OpIdx) { + NewOps[Op] = M->getOperand(Op); + continue; + } + + MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(M), MVT::i32, FI); + NewOps[Op] = SDValue(Mov, 0); + } - CurDAG->UpdateNodeOperands(ST, StoreOps); - Modified = true; + CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN)); + } + } } - - // XXX - Is this necessary? - if (Modified) - CurDAG->RemoveDeadNodes(); } void AMDGPUDAGToDAGISel::PostprocessISelDAG() { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1a59a460ee7d..352423ed3ad6 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,7 +15,6 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -28,16 +27,19 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" - +#include "llvm/IR/DiagnosticInfo.h" +#include "SIInstrInfo.h" using namespace llvm; -static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); +static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + MachineFunction &MF = State.getMachineFunction(); + AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); + uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; } @@ -53,60 +55,104 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -// Type for a vector that will be loaded to.
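// [Editor's note] allocateKernArg above switches kernel arguments from
// generic stack slots to offsets in the kernarg segment. Its presumed
// behavior is a simple aligned bump allocator; the helper below is a
// hypothetical model of that behavior, not the patch's code.
#include <cassert>
#include <cstdint>

struct KernArgAllocator {
  uint64_t Size = 0; // Running size of the kernarg segment.

  uint64_t allocate(uint64_t ArgSize, uint64_t Align) {
    uint64_t Offset = (Size + Align - 1) / Align * Align; // alignTo(Size, Align)
    Size = Offset + ArgSize;
    return Offset;
  }
};

int main() {
  KernArgAllocator A;
  assert(A.allocate(4, 4) == 0); // i32 at offset 0.
  assert(A.allocate(1, 1) == 4); // i8 right behind it.
  assert(A.allocate(8, 8) == 8); // i64 aligned up to offset 8.
}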
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { +EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) { unsigned StoreSize = VT.getStoreSizeInBits(); if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, 32); + return EVT::getIntegerVT(Ctx, StoreSize); return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, +AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - // This is totally unsupported, just custom lower to produce an error. - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); - // Library functions. These default to Expand, but we have instructions - // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); - // Expand to fneg + fadd. - setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. 
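// [Editor's note] The i64/f64-to-v2i32 load promotions registered above are
// pure bit reinterpretations. In scalar C++ terms (a hypothetical sketch,
// with the word order shown for a little-endian target):
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double F = 3.141592653589793; // An f64 value about to be "loaded as v2i32".
  uint32_t Words[2];
  static_assert(sizeof(F) == sizeof(Words), "f64 is exactly two i32 words");
  std::memcpy(Words, &F, sizeof(F)); // The BITCAST: no conversion, just bits.
  double Back;
  std::memcpy(&Back, Words, sizeof(Back));
  assert(Back == F); // Round-trips exactly.
}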
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - // Lower floating point store/load to integer store/load to reduce the number - // of patterns in tablegen. 
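// [Editor's note] The setLoadExtAction/setTruncStoreAction tables above
// encode which widening loads and narrowing stores the hardware can do
// directly. In scalar C++ terms (hypothetical sketch), the extension kinds
// and a truncating store are:
#include <cassert>
#include <cstdint>

int main() {
  int8_t Mem = -5;                           // An i8 in memory.
  int32_t SExt = Mem;                        // SEXTLOAD i8 -> i32.
  uint32_t ZExt = static_cast<uint8_t>(Mem); // ZEXTLOAD i8 -> i32.
  assert(SExt == -5 && ZExt == 251);         // An EXTLOAD may produce either.

  int32_t Val = 0x1234;
  Mem = static_cast<int8_t>(Val);            // Truncating store i32 -> i8.
  assert(Mem == 0x34);
}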
setOperationAction(ISD::STORE, MVT::f32, Promote); AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); @@ -122,51 +168,99 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); + AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::f64, Promote); - AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v2f64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - // Custom lowering of vector stores is required for local address space - // stores. - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - // XXX: This can be change to Custom, once ExpandVectorStores can - // handle 64-bit stores. + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); - setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - setOperationAction(ISD::LOAD, MVT::f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); + setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); - setOperationAction(ISD::LOAD, MVT::v8f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); + setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); - setOperationAction(ISD::LOAD, MVT::v16f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - setOperationAction(ISD::LOAD, MVT::f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, 
Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - setOperationAction(ISD::LOAD, MVT::v2f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); @@ -179,31 +273,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - // There are no 64-bit extloads. These should be done as a 32-bit extload and - // an extension to 64-bit. 
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } - - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FCEIL, MVT::f64, Custom); setOperationAction(ISD::FTRUNC, MVT::f64, Custom); @@ -219,28 +288,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); - setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::SREM, VT, Expand); + // These should use [SU]DIVREM, so set them to expand setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); // GPU does not have divrem function for signed or unsigned. setOperationAction(ISD::SDIVREM, VT, Custom); @@ -284,17 +338,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, if (Subtarget->hasFFBH()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - else - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - if (!Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + // We only really have 32-bit BFE instructions (and 16-bit on VI). + // + // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any + // effort to match them now. We want this to be false for i64 cases when the + // extraction isn't restricted to the upper or lower half. 
Ideally we would + // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that + // span the midpoint are probably relatively rare, so don't worry about them + // for now. + if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -334,9 +395,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } @@ -366,24 +425,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + // This causes using an unrolled select operation rather than expansion with + // bit operations. This is in general better, but the alternative using BFI + // instructions may be better if the select sources are SGPRs. + setOperationAction(ISD::SELECT, MVT::v2f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); + setOperationAction(ISD::SELECT, MVT::v4f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -394,7 +449,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. 
They are also optional in OpenCL // (Section 7.3) - setHasFloatingPointExceptions(false); + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); setSelectIsExpensive(false); PredictableSelectIsExpensive = false; @@ -415,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; MaxStoresPerMemset = 4096; + + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); } //===----------------------------------------------------------------------===// @@ -467,15 +534,17 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { - if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) - return true; - unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); - unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); + + if (LoadTy.getScalarType() == MVT::i32) + return false; - return ((LScalarSize <= CastScalarSize) || - (CastScalarSize >= 32) || - (LScalarSize < 32)); + unsigned LScalarSize = LoadTy.getScalarSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarSizeInBits(); + + return (LScalarSize < CastScalarSize) || + (CastScalarSize >= 32); } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -578,14 +647,13 @@ void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, State.AnalyzeReturn(Outs, RetCC_SI); } -SDValue AMDGPUTargetLowering::LowerReturn( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +SDValue +AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); } //===---------------------------------------------------------------------===// @@ -606,32 +674,38 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) FuncName = G->getGlobal()->getName(); - DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DiagnosticInfoUnsupported NoCalls( + Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); - return SDValue(); + + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + + return DAG.getEntryNode(); } SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", + SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(NoDynamicAlloca); - return SDValue(); + auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; + return DAG.getMergeValues(Ops, SDLoc()); } SDValue
AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: - Op.getNode()->dump(); + Op->dump(&DAG); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); @@ -666,24 +740,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - if (!Node) - return; - - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; - } - case ISD::STORE: { - SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); - if (Lowered.getNode()) - Results.push_back(Lowered); - return; - } default: return; } @@ -712,16 +768,16 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, EVT VT = EVT::getEVT(InitTy); PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(InitTy)); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(InitTy)); } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(CFP->getType())); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(CFP->getType())); } if (StructType *ST = dyn_cast<StructType>(InitTy)) { @@ -769,8 +825,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, EVT VT = EVT::getEVT(InitTy); PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(InitTy)); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(InitTy)); } Init->dump(); @@ -782,10 +838,7 @@ static bool hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar || !GVar->hasInitializer()) return false; - if (isa<UndefValue>(GVar->getInitializer())) - return false; - - return true; + return !isa<UndefValue>(GVar->getInitializer()); } SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, @@ -797,6 +850,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { + case AMDGPUAS::CONSTANT_ADDRESS: { + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA); + } case AMDGPUAS::LOCAL_ADDRESS: { // XXX:
What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && @@ -808,11 +866,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Offset; if (MFI->LocalMemoryObjects.count(GV) == 0) { - uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); - Offset = MFI->LDSSize; + unsigned Align = GV->getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV->getValueType()); + + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align); MFI->LocalMemoryObjects[GV] = Offset; - // XXX: Account for alignment? - MFI->LDSSize += Size; + MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType()); } else { Offset = MFI->LocalMemoryObjects[GV]; } @@ -820,50 +883,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, return DAG.getConstant(Offset, SDLoc(Op), getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS)); } - case AMDGPUAS::CONSTANT_ADDRESS: { - MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - Type *EltType = GV->getType()->getElementType(); - unsigned Size = DL.getTypeAllocSize(EltType); - unsigned Alignment = DL.getPrefTypeAlignment(EltType); - - MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS); - MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); - - const GlobalVariable *Var = cast<GlobalVariable>(GV); - if (!Var->hasInitializer()) { - // This has no use, but bugpoint will hit it. - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - - const Constant *Init = Var->getInitializer(); - SmallVector<SDNode *, 8> WorkList; - - for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), - E = DAG.getEntryNode()->use_end(); I != E; ++I) { - if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) - continue; - WorkList.push_back(*I); - } - SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); - for (SmallVector<SDNode *, 8>::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - SmallVector<SDValue, 8> Ops; - Ops.push_back(Chain); - for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { - Ops.push_back((*I)->getOperand(i)); - } - DAG.UpdateNodeOperands(*I, Ops); - } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } } const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported BadInit(Fn, - "initializer for address space"); + DiagnosticInfoUnsupported BadInit( + Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(BadInit); return SDValue(); } @@ -875,7 +899,7 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, @@ -887,23 +911,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, - SelectionDAG &DAG) const { - - MachineFunction &MF =
DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); - - FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); - - unsigned FrameIndex = FIN->getIndex(); - unsigned IgnoredFrameReg; - unsigned Offset = - TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), - Op.getValueType()); + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, @@ -914,121 +922,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { default: return Op; - case AMDGPUIntrinsic::AMDGPU_abs: - case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. - return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDGPU_lrp: - return LowerIntrinsicLRP(Op, DAG); - - case AMDGPUIntrinsic::AMDGPU_clamp: - case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::AMDGPU_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!Param) - return DAG.getUNDEF(VT); - - // Translate to the operands expected by the machine instruction. The - // first parameter must be the same as the first instruction. - SDValue Numerator = Op.getOperand(1); - SDValue Denominator = Op.getOperand(2); - - // Note this order is opposite of the machine instruction's operations, - // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The - // intrinsic has the numerator as the first operand to match a normal - // division operation. - - SDValue Src0 = Param->isAllOnesValue() ?
Numerator : Denominator; - - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, - Denominator, Numerator); - } - - case Intrinsic::AMDGPU_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - - case Intrinsic::AMDGPU_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::AMDGPU_rcp: - return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq: - return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_legacy_rsq: - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq_clamped: - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - Type *Type = VT.getTypeForEVT(*DAG.getContext()); - APFloat Max = APFloat::getLargest(Type->getFltSemantics()); - APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); - - SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); - return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, DL, VT)); - } else { - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); - } - - case Intrinsic::AMDGPU_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umul24: - return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imul24: - return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umad24: - return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_imad24: - return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -1039,70 +936,14 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfi: - return DAG.getNode(AMDGPUISD::BFI, DL, VT, - Op.getOperand(1), - 
Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfm: - return DAG.getNode(AMDGPUISD::BFM, DL, VT, - Op.getOperand(1), - Op.getOperand(2)); - - case Intrinsic::AMDGPU_class: - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. - return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. - return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name - return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); - } -} - -///IABS(a) = SMAX(sub(0, a), a) -SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - Op.getOperand(1)); - - return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); -} - -/// Linear Interpolation -/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) -SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - // TODO: Should this propagate fast-math-flags? - SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, - DAG.getConstantFP(1.0f, DL, MVT::f32), - Op.getOperand(1)); - SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, - Op.getOperand(3)); - return DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), - OneSubAC); + Op.getOperand(3)); + } } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, + SDValue LHS, SDValue RHS, + SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -1176,56 +1017,48 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = cast<LoadSDNode>(Op); - EVT MemVT = Load->getMemoryVT(); - EVT MemEltVT = MemVT.getVectorElementType(); +std::pair<SDValue, SDValue> +AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); - EVT LoadVT = Op.getValueType(); - EVT EltVT = LoadVT.getVectorElementType(); - EVT PtrVT = Load->getBasePtr().getValueType(); + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); - unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); - SmallVector<SDValue, 8> Loads; - SmallVector<SDValue, 8> Chains; + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); - SDLoc SL(Op); - unsigned MemEltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * MemEltSize, SL, PtrVT)); + return std::make_pair(Lo, Hi); +} - SDValue NewLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, -
SrcValue.getWithOffset(i * MemEltSize), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - Loads.push_back(NewLoad.getValue(0)); - Chains.push_back(NewLoad.getValue(1)); - } +SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); - SDValue Ops[] = { - DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) - }; + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); +} - return DAG.getMergeValues(Ops, SL); +SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); EVT VT = Op.getValueType(); + // If this is a 2 element vector, we really want to scalarize and not create // weird 1 element vectors. if (VT.getVectorNumElements() == 2) - return ScalarizeVectorLoad(Op, DAG); + return scalarizeVectorLoad(Load, DAG); - LoadSDNode *Load = cast<LoadSDNode>(Op); SDValue BasePtr = Load->getBasePtr(); EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); @@ -1245,22 +1078,15 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, unsigned BaseAlign = Load->getAlignment(); unsigned HiAlign = MinAlign(BaseAlign, Size); - SDValue LoLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, - SrcValue, - LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), BaseAlign); - + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, SrcValue, LoMemVT, + BaseAlign, Load->getMemOperand()->getFlags()); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Size, SL, PtrVT)); - - SDValue HiLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, - Load->getChain(), HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), HiAlign); + SDValue HiLoad = + DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), + HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1271,6 +1097,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return DAG.getMergeValues(Ops, SL); } +// FIXME: This isn't doing anything for SI. This should be used in a target + // combine during type legalization.
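// [Editor's note] MergeVectorStore below packs a small vector into one
// integer so a single store (or truncating store) can cover it. A
// hypothetical scalar model of that packing for a v4i8 stored as one i32:
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Elts[4] = {0x11, 0x22, 0x33, 0x44}; // The v4i8 value.
  uint32_t Packed = 0;
  for (int I = 0; I < 4; ++I) // Element i lands at bit offset 8*i.
    Packed |= static_cast<uint32_t>(Elts[I]) << (8 * I);
  assert(Packed == 0x44332211u); // One i32 store now covers all four bytes.
}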
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1317,48 +1145,15 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, if (PackedSize < 32) { EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - PackedVT, - Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + Store->getMemOperand()->getPointerInfo(), PackedVT, + Store->getAlignment(), + Store->getMemOperand()->getFlags()); } return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, Store->getMemOperand()->getPointerInfo(), - Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment()); -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast<StoreSDNode>(Op); - EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); - EVT EltVT = Store->getValue().getValueType().getVectorElementType(); - EVT PtrVT = Store->getBasePtr().getValueType(); - unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); - SDLoc SL(Op); - - SmallVector<SDValue, 8> Chains; - - unsigned EltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); - - for (unsigned i = 0, e = NumElts; i != e; ++i) { - SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Store->getValue(), - DAG.getConstant(i, SL, MVT::i32)); - - SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); - SDValue NewStore = - DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, - SrcValue.getWithOffset(i * EltSize), - MemEltVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); - Chains.push_back(NewStore); - } - - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); + Store->getAlignment(), + Store->getMemOperand()->getFlags()); } SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, @@ -1370,7 +1165,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, // If this is a 2 element vector, we really want to scalarize and not create // weird 1 element vectors.
if (VT.getVectorNumElements() == 2) - return ScalarizeVectorStore(Op, DAG); + return scalarizeVectorStore(Store, DAG); EVT MemVT = Store->getMemoryVT(); SDValue Chain = Store->getChain(); @@ -1395,171 +1190,21 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, unsigned Size = LoMemVT.getStoreSize(); unsigned HiAlign = MinAlign(BaseAlign, Size); - SDValue LoStore - = DAG.getTruncStore(Chain, SL, Lo, - BasePtr, - SrcValue, - LoMemVT, - Store->isNonTemporal(), - Store->isVolatile(), - BaseAlign); - SDValue HiStore - = DAG.getTruncStore(Chain, SL, Hi, - HiPtr, - SrcValue.getWithOffset(Size), - HiMemVT, - Store->isNonTemporal(), - Store->isVolatile(), - HiAlign); + SDValue LoStore = + DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, + Store->getMemOperand()->getFlags()); + SDValue HiStore = + DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), + HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } - -SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast<LoadSDNode>(Op); - ISD::LoadExtType ExtType = Load->getExtensionType(); - EVT VT = Op.getValueType(); - EVT MemVT = Load->getMemoryVT(); - - if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { - assert(VT == MVT::i1 && "Only i1 non-extloads expected"); - // FIXME: Copied from PPC - // First, load into 32 bits, then truncate to 1 bit. - - SDValue Chain = Load->getChain(); - SDValue BasePtr = Load->getBasePtr(); - MachineMemOperand *MMO = Load->getMemOperand(); - - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, MVT::i8, MMO); - - SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), - NewLD.getValue(1) - }; - - return DAG.getMergeValues(Ops, DL); - } - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || - Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || - ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) - return SDValue(); - - // <SI && AS=PRIVATE && EXTLOAD && size < 32bit - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, DL, MVT::i32)); - // Load the Register. - SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - - // Get offset within the register. - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); - - // Bit offset of target byte (byteIdx * 8). - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); - - // Eliminate the upper bits by setting them to ... - EVT MemEltVT = MemVT.getScalarType(); - - // ... ones. - if (ExtType == ISD::SEXTLOAD) { - SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); - } - - // ... or zeros.
- SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); - if (Result.getNode()) { - return Result; - } - - StoreSDNode *Store = cast<StoreSDNode>(Op); - SDValue Chain = Store->getChain(); - if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && - Store->getValue().getValueType().isVector()) { - return SplitVectorStore(Op, DAG); - } - - EVT MemVT = Store->getMemoryVT(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && - MemVT.bitsLT(MVT::i32)) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue BasePtr = Store->getBasePtr(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, - DAG.getConstant(0x3, DL, MVT::i32)); - - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, - Store->getValue()); - - SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); - - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - - SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - } - return SDValue(); -} - // This is a shortcut for integer division because we have fast i32<->f32 // conversions, and fast f32 reciprocal instructions. The fractional part of a -// float is enough to accurately represent up to a 24-bit integer. -SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { +// float is enough to accurately represent up to a 24-bit signed integer. +SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, + bool Sign) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); @@ -1567,20 +1212,26 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool MVT IntVT = MVT::i32; MVT FltVT = MVT::f32; - ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - ISD::NodeType ToInt = sign ?
ISD::FP_TO_SINT : ISD::FP_TO_UINT; + unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); + if (LHSSignBits < 9) + return SDValue(); - if (VT.isVector()) { - unsigned NElts = VT.getVectorNumElements(); - IntVT = MVT::getVectorVT(MVT::i32, NElts); - FltVT = MVT::getVectorVT(MVT::f32, NElts); - } + unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); + if (RHSSignBits < 9) + return SDValue(); + + unsigned BitSize = VT.getSizeInBits(); + unsigned SignBits = std::min(LHSSignBits, RHSSignBits); + unsigned DivBits = BitSize - SignBits; + if (Sign) + ++DivBits; - unsigned BitSize = VT.getScalarType().getSizeInBits(); + ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; SDValue jq = DAG.getConstant(1, DL, IntVT); - if (sign) { + if (Sign) { // char|short jq = ia ^ ib; jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); @@ -1590,18 +1241,13 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // jq = jq | 0x1 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, IntVT); } // int ia = (int)LHS; - SDValue ia = sign ? - DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + SDValue ia = LHS; // int ib, (int)RHS; - SDValue ib = sign ? - DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + SDValue ib = RHS; // float fa = (float)ia; SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); @@ -1609,8 +1255,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); - // TODO: Should this propagate fast-math-flags? - // float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1621,8 +1265,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, - DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); @@ -1641,9 +1284,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // jq = (cv ? jq : 0); jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); - // dst = trunc/extend to legal type - iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - // dst = iq + jq; SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); @@ -1651,11 +1291,19 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); + // Truncate to number of bits this divide really is. 
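+  // Worked example: with signed i32 operands that each have at least 9 sign
+  // bits, SignBits >= 9 and DivBits = 32 - 9 + 1 = 24 at most -- exactly the
+  // 24 bits of precision an f32 mantissa carries, which is what makes the
+  // rcp-based sequence above exact for these inputs.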
+ if (Sign) { + SDValue InRegSize + = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); + Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); + Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); + } else { + SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); + Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); + Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); + } + + return DAG.getMergeValues({ Div, Rem }, DL); } void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, @@ -1686,10 +1334,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); - Results.push_back(DIV); - Results.push_back(REM); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); + + Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); + Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); return; } @@ -1698,7 +1347,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); + REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); SDValue DIV_Lo = zero; @@ -1718,7 +1368,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // Add LHS high bit REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); - SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); + SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); @@ -1728,7 +1378,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); } - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); + DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); Results.push_back(DIV); Results.push_back(REM); } @@ -1744,19 +1395,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Results, DL); } - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && - DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, false); - } + if (SDValue Res = LowerDIVREM24(Op, DAG, false)) + return Res; } + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. 
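  // Example: for Den = 3, RCP is ~0x55555555 (2^32 / 3 rounded down), so a
  // high multiply of Num by RCP gives a quotient estimate that is at most one
  // off and is corrected afterwards. (Illustrative value; the exact rounding
  // of URECIP is defined by the hardware instruction.)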
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); @@ -1864,11 +1510,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue Zero = DAG.getConstant(0, DL, VT); SDValue NegOne = DAG.getConstant(-1, DL, VT); - if (VT == MVT::i32 && - DAG.ComputeNumSignBits(LHS) > 8 && - DAG.ComputeNumSignBits(RHS) > 8) { - return LowerDIVREM24(Op, DAG, true); + if (VT == MVT::i32) { + if (SDValue Res = LowerDIVREM24(Op, DAG, true)) + return Res; } + if (VT == MVT::i64 && DAG.ComputeNumSignBits(LHS) > 32 && DAG.ComputeNumSignBits(RHS) > 32) { @@ -1954,7 +1600,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { +static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, + SelectionDAG &DAG) { const unsigned FractBits = 52; const unsigned ExpBits = 11; @@ -1992,8 +1639,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); // Extend back to to 64-bits. - SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - Zero, SignBit); + SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); @@ -2391,7 +2037,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::i32, FloorMul); SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); } @@ -2437,7 +2083,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, for (unsigned I = 0; I < NElts; ++I) Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); + return DAG.getBuildVector(VT, DL, Args); } //===----------------------------------------------------------------------===// @@ -2476,8 +2122,8 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { } template -static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width, SDLoc DL) { +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, + uint32_t Width, const SDLoc &DL) { if (Width + Offset < 32) { uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); IntTy Result = static_cast(Shl) >> (32 - Width); @@ -2487,55 +2133,175 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); } -static bool usesAllNormalStores(SDNode *LoadVal) { - for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { - if (!ISD::isNormalStore(*I)) - return false; +static bool hasVolatileUser(SDNode *Val) { + for (SDNode *U : Val->uses()) { + if (MemSDNode *M = dyn_cast(U)) { + if (M->isVolatile()) + return true; + } } + return false; +} + +bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { + // i32 vectors are the canonical memory type. 
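+  // e.g. a 12-byte v12i8 is combined (while v3i32 is already canonical), a
+  // 1-, 2- or 4-byte scalar is left alone, and sizes such as 3 or 6 bytes
+  // cannot be expressed as a whole number of dwords, so they are rejected
+  // below.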
+  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+    return false;
+
+  if (!VT.isByteSized())
+    return false;
+
+  unsigned Size = VT.getStoreSize();
+
+  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+    return false;
+
+  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+    return false;
+
+  return true;
+}
+
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  LoadSDNode *LN = cast<LoadSDNode>(N);
+  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+    return SDValue();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = LN->getMemoryVT();
+
+  unsigned Size = VT.getStoreSize();
+  unsigned Align = LN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = LN->getAddressSpace();
+
+    // Expand unaligned loads earlier than legalization. Due to visitation order
+    // problems during legalization, the emitted instructions to pack and unpack
+    // the bytes again are not eliminated in the case of an unaligned copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+      return DAG.getMergeValues(Ops, SDLoc(N));
+    }
+
+    if (!IsFast)
+      return SDValue();
+  }
+
+  if (!shouldCombineMemoryType(VT))
+    return SDValue();
+
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+  SDValue NewLoad
+    = DAG.getLoad(NewVT, SL, LN->getChain(),
+                  LN->getBasePtr(), LN->getMemOperand());
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+  DCI.CombineTo(N, BC, NewLoad.getValue(1));
+  return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
   StoreSDNode *SN = cast<StoreSDNode>(N);
-  SDValue Value = SN->getValue();
-  EVT VT = Value.getValueType();
+  if (SN->isVolatile() || !ISD::isNormalStore(SN))
+    return SDValue();
+
+  EVT VT = SN->getMemoryVT();
+  unsigned Size = VT.getStoreSize();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Align = SN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = SN->getAddressSpace();
+
+    // Expand unaligned stores earlier than legalization. Due to visitation
+    // order problems during legalization, the emitted instructions to pack and
+    // unpack the bytes again are not eliminated in the case of an unaligned
+    // copy.
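+  // e.g. an i32 store with 1-byte alignment to an address space that does not
+  // allow the misaligned access is expanded right here into whatever series
+  // of smaller aligned stores expandUnalignedStore produces for the target,
+  // rather than waiting for the legalizer to do it.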
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) + return expandUnalignedStore(SN, DAG); + + if (!IsFast) + return SDValue(); + } + + if (!shouldCombineMemoryType(VT)) + return SDValue(); + + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); + SDValue Val = SN->getValue(); + + //DCI.AddToWorklist(Val.getNode()); + + bool OtherUses = !Val.hasOneUse(); + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); + if (OtherUses) { + SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); + DAG.ReplaceAllUsesOfValueWith(Val, CastBack); + } + + return DAG.getStore(SN->getChain(), SL, CastVal, + SN->getBasePtr(), SN->getMemOperand()); +} - if (isTypeLegal(VT) || SN->isVolatile() || - !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) +// TODO: Should repeat for other bit ops. +SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) return SDValue(); - LoadSDNode *LoadVal = cast(Value); - if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + // Break up 64-bit and of a constant into two 32-bit ands. This will typically + // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer + // combine opportunities since most 64-bit operations are decomposed this way. + // TODO: We won't want this for SALU especially if it is an inline immediate. + const ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); + if (!RHS) return SDValue(); - EVT MemVT = LoadVal->getMemoryVT(); + uint64_t Val = RHS->getZExtValue(); + if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) { + // If either half of the constant is 0, this is really a 32-bit and, so + // split it. If we can re-use the full materialized constant, keep it. + return SDValue(); + } SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - LoadVT, SL, - LoadVal->getChain(), - LoadVal->getBasePtr(), - LoadVal->getOffset(), - LoadVT, - LoadVal->getMemOperand()); + SDValue Lo, Hi; + std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG); + + SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32); + SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32); - SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); - DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS); - return DAG.getStore(SN->getChain(), SL, NewLoad, - SN->getBasePtr(), SN->getMemOperand()); + // Re-visit the ands. It's possible we eliminated one of them and it could + // simplify the vector. + DCI.AddToWorklist(Lo.getNode()); + DCI.AddToWorklist(Hi.getNode()); + + SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); } SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, @@ -2543,14 +2309,17 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, if (N->getValueType(0) != MVT::i64) return SDValue(); - // i64 (shl x, 32) -> (build_pair 0, x) + // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) - // Doing this with moves theoretically helps MI optimizations that understand - // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as - // v_lshl_b64. In the SALU case, I think this is slightly worse since it - // doubles the code size and I'm unsure about cycle count. 
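+  // e.g. (i64 (shl x, 33)) becomes a bitcast of (build_vector 0, (shl
+  // lo_32(x), 1)): once the shift amount is >= 32, only the low half of x
+  // can reach the result.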
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the + // common case, splitting this into a move and a 32-bit shift is faster and + // the same code size. const ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); - if (!RHS || RHS->getZExtValue() != 32) + if (!RHS) + return SDValue(); + + unsigned RHSVal = RHS->getZExtValue(); + if (RHSVal < 32) return SDValue(); SDValue LHS = N->getOperand(0); @@ -2558,11 +2327,85 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - // Extract low 32-bits. + SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo); + + SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); +} + +SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) + return SDValue(); + + const ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); + if (!RHS) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + unsigned RHSVal = RHS->getZExtValue(); + + // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) + if (RHSVal == 32) { + SDValue Hi = getHiHalf64(N->getOperand(0), DAG); + SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, + DAG.getConstant(31, SL, MVT::i32)); + + SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); + } + + // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) + if (RHSVal == 63) { + SDValue Hi = getHiHalf64(N->getOperand(0), DAG); + SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, + DAG.getConstant(31, SL, MVT::i32)); + SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) + return SDValue(); + + const ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); + if (!RHS) + return SDValue(); + + unsigned ShiftAmt = RHS->getZExtValue(); + if (ShiftAmt < 32) + return SDValue(); + + // srl i64:x, C for C >= 32 + // => + // build_pair (srl hi_32(x), C - 32), 0 + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + + SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, + VecOp, One); + + SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); + SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); + + SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, @@ -2610,8 +2453,8 @@ static bool isCtlzOpc(unsigned Opc) { // type VT. // Need to match pre-legalized type because the generic legalization inserts the // add/sub between the select and compare. 
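// (For reference: the hardware ffbh instruction returns -1 (all ones) for a
// zero input, which is why the (x == 0 ? 32 : ctlz(x)) select idiom handled
// below is worth collapsing into a single instruction.)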
-static SDValue getFFBH_U32(const TargetLowering &TLI, - SelectionDAG &DAG, SDLoc SL, SDValue Op) { +static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG, + const SDLoc &SL, SDValue Op) { EVT VT = Op.getValueType(); EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32) @@ -2634,10 +2477,8 @@ static SDValue getFFBH_U32(const TargetLowering &TLI, // against the bitwidth. // // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, - SDValue Cond, - SDValue LHS, - SDValue RHS, +SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, + SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast(Cond.getOperand(1)); if (!CmpRhs || !CmpRhs->isNullValue()) @@ -2680,8 +2521,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); - if (VT == MVT::f32 && Cond.hasOneUse()) - return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + if (VT == MVT::f32 && Cond.hasOneUse()) { + SDValue MinMax + = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + // Revisit this node so we can catch min3/max3/med3 patterns. + //DCI.AddToWorklist(MinMax.getNode()); + return MinMax; + } // There's no reason to not do this if the condition has other uses. return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); @@ -2695,12 +2541,62 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, switch(N->getOpcode()) { default: break; + case ISD::BITCAST: { + EVT DestVT = N->getValueType(0); + if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) + break; + + // Fold bitcasts of constants. 
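+  // (Concrete instance of the rule stated below:
+  //    (v2i32 (bitcast i64 0x1122334455667788))
+  //      -> (build_vector 0x55667788, 0x11223344),
+  //  i.e. the low dword becomes element 0.)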
+ // + // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) + // TODO: Generalize and move to DAGCombiner + SDValue Src = N->getOperand(0); + if (ConstantSDNode *C = dyn_cast(Src)) { + assert(Src.getValueType() == MVT::i64); + SDLoc SL(N); + uint64_t CVal = C->getZExtValue(); + return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + } + + if (ConstantFPSDNode *C = dyn_cast(Src)) { + const APInt &Val = C->getValueAPF().bitcastToAPInt(); + SDLoc SL(N); + uint64_t CVal = Val.getZExtValue(); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + + return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); + } + + break; + } case ISD::SHL: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; return performShlCombine(N, DCI); } + case ISD::SRL: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performSrlCombine(N, DCI); + } + case ISD::SRA: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performSraCombine(N, DCI); + } + case ISD::AND: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performAndCombine(N, DCI); + } case ISD::MUL: return performMulCombine(N, DCI); case AMDGPUISD::MUL_I24: @@ -2797,7 +2693,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } - + case ISD::LOAD: + return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); } @@ -2840,20 +2737,6 @@ void AMDGPUTargetLowering::getOriginalFunctionArgs( } } -bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->isExactlyValue(1.0); - } - return isAllOnesConstant(Op); -} - -bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->getValueAPF().isZero(); - } - return isNullConstant(Op); -} - SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2889,10 +2772,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(RET_FLAG); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes + NODE_NAME_CASE(ENDPGM) + NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) @@ -2906,6 +2790,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN3) NODE_NAME_CASE(SMIN3) NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(FMED3) + NODE_NAME_CASE(SMED3) + NODE_NAME_CASE(UMED3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -2914,7 +2801,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RSQ_LEGACY) - NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) @@ -2934,7 +2821,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(LOAD_INPUT) NODE_NAME_CASE(SAMPLE) NODE_NAME_CASE(SAMPLEB) @@ -2946,13 +2832,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE3) 
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) + NODE_NAME_CASE(PC_ADD_REL_OFFSET) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(STORE_MSKOR) + NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(ATOMIC_CMP_SWAP) + NODE_NAME_CASE(ATOMIC_INC) + NODE_NAME_CASE(ATOMIC_DEC) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -2998,21 +2889,6 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } -static void computeKnownBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { - APInt Op0Zero, Op0One; - APInt Op1Zero, Op1One; - DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); - DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); - - KnownZero = Op0Zero & Op1Zero; - KnownOne = Op0One & Op1One; -} - void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, @@ -3029,22 +2905,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( switch (Opc) { default: break; - case ISD::INTRINSIC_WO_CHAIN: { - // FIXME: The intrinsic should just use the node. - switch (cast(Op.getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::AMDGPU_imax: - case AMDGPUIntrinsic::AMDGPU_umax: - case AMDGPUIntrinsic::AMDGPU_imin: - case AMDGPUIntrinsic::AMDGPU_umin: - computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); - break; - default: - break; - } - - break; - } case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: { KnownZero = APInt::getHighBitsSet(32, 31); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 37925416a9c4..c2c758592d1c 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H #include "llvm/Target/TargetLowering.h" @@ -28,12 +28,10 @@ class AMDGPUTargetLowering : public TargetLowering { protected: const AMDGPUSubtarget *Subtarget; -private: SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -67,42 +65,43 @@ private: SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; +protected: + bool shouldCombineMemoryType(EVT VT) const; + SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, 
SDValue RHS, - DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, + SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; -protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + static EVT getEquivalentBitType(LLVMContext &Context, EVT VT); virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector load into a scalar load of each component. - SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Return 64-bit value Op as two 32-bit integers. + std::pair split64BitValue(SDValue Op, + SelectionDAG &DAG) const; + SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; + SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; /// \brief Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into a scalar store of each component. - SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into 2 stores of half the vector. SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl &Results) const; - bool isHWTrueValue(SDValue Op) const; - bool isHWFalseValue(SDValue Op) const; - /// The SelectionDAGBuilder will automatically promote function arguments /// with illegal types. However, this does not work for the AMDGPU targets /// since the function arguments are stored in memory as these illegal types. 
@@ -119,7 +118,7 @@ protected: const SmallVectorImpl &Outs) const; public: - AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; @@ -141,7 +140,7 @@ public: ISD::LoadExtType ExtType, EVT ExtVT) const override; - bool isLoadBitCastBeneficial(EVT, EVT) const override; + bool isLoadBitCastBeneficial(EVT, EVT) const final; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, @@ -150,11 +149,10 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; + const SmallVectorImpl &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; @@ -167,16 +165,9 @@ public: SmallVectorImpl &Results, SelectionDAG &DAG) const override; - SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const; + SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, + SDValue RHS, SDValue True, SDValue False, + SDValue CC, DAGCombinerInfo &DCI) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -189,9 +180,7 @@ public: unsigned &RefinementSteps) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, - SelectionDAG &DAG) const { - return N; - } + SelectionDAG &DAG) const = 0; /// \brief Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne @@ -214,8 +203,9 @@ public: unsigned Reg, EVT VT) const; enum ImplicitParameter { - GRID_DIM, - GRID_OFFSET + FIRST_IMPLICIT, + GRID_DIM = FIRST_IMPLICIT, + GRID_OFFSET, }; /// \brief Helper function that returns the byte offset of the given @@ -231,9 +221,10 @@ enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication - RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes + ENDPGM, + RETURN, DWORDADDR, FRACT, CLAMP, @@ -250,6 +241,9 @@ enum NodeType : unsigned { FMIN3, SMIN3, UMIN3, + FMED3, + SMED3, + UMED3, URECIP, DIV_SCALE, DIV_FMAS, @@ -261,7 +255,7 @@ enum NodeType : unsigned { RCP, RSQ, RSQ_LEGACY, - RSQ_CLAMPED, + RSQ_CLAMP, LDEXP, FP_CLASS, DOT4, @@ -307,10 +301,14 @@ enum NodeType : unsigned { INTERP_MOV, INTERP_P1, INTERP_P2, + PC_ADD_REL_OFFSET, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + ATOMIC_CMP_SWAP, + ATOMIC_INC, + ATOMIC_DEC, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index a266e711af5b..9a00ecb24ebe 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -30,163 +30,8 @@ using namespace llvm; // Pin the vtable to this file. 
void AMDGPUInstrInfo::anchor() {} -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1, -1), ST(st) {} - -const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { - return RI; -} - -bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const { -// TODO: Implement this function - return false; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} - -MachineInstr * -AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { -// TODO: Implement this function - return nullptr; -} - -void -AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -void -AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { - MachineBasicBlock *MBB = MI->getParent(); - int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::addr); - // addr is a custom operand with multiple MI operands, and only the - // first MI operand is given a name. 
- int RegOpIdx = OffsetOpIdx + 1; - int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::chan); - if (isRegisterLoad(*MI)) { - int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::dst); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - getIndirectAddrRegClass()->getRegister(Address)); - } else { - buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - Address, OffsetReg); - } - } else if (isRegisterStore(*MI)) { - int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::val); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), - MI->getOperand(ValOpIdx).getReg()); - } else { - buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), - calculateIndirectAddress(RegIndex, Channel), - OffsetReg); - } - } else { - return false; - } - - MBB->erase(MI); - return true; -} - -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { -// TODO: Implement this function - return nullptr; -} -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { - // TODO: Implement this function - return nullptr; -} -bool -AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, - bool UnfoldStore, - SmallVectorImpl &NewMIs) const { - // TODO: Implement this function - return false; -} - -bool -AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const { - // TODO: Implement this function - return false; -} - -unsigned -AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex) const { - // TODO: Implement this function - return 0; -} +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) + : AMDGPUGenInstrInfo(-1, -1), ST(ST) {} bool AMDGPUInstrInfo::enableClusterLoads() const { return true; @@ -214,106 +59,6 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } -bool -AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) - const { - // TODO: Implement this function - return true; -} -void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - // TODO: Implement this function -} - -bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - // TODO: Implement this function - return false; -} - -bool 
AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { - // TODO: Implement this function - return MI->getDesc().isPredicable(); -} - -bool -AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - // TODO: Implement this function - return true; -} - -bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; -} - -bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; -} - -int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int Offset = -1; - - if (MFI->getNumObjects() == 0) { - return -1; - } - - if (MRI.livein_empty()) { - return 0; - } - - const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); - for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), - LE = MRI.livein_end(); - LI != LE; ++LI) { - unsigned Reg = LI->first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) - continue; - - unsigned RegIndex; - unsigned RegEnd; - for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; - ++RegIndex) { - if (IndirectRC->getRegister(RegIndex) == Reg) - break; - } - Offset = std::max(Offset, (int)RegIndex); - } - - return Offset + 1; -} - -int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { - int Offset = 0; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Variable sized objects are not supported - assert(!MFI->hasVarSizedObjects()); - - if (MFI->getNumObjects() == 0) { - return -1; - } - - unsigned IgnoredFrameReg; - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( - MF, -1, IgnoredFrameReg); - - return getIndirectIndexBegin(MF) + Offset; -} - int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { switch (Channels) { default: return Opcode; @@ -323,35 +68,44 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { } } +// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +enum SIEncodingFamily { + SI = 0, + VI = 1 +}; + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. namespace llvm { namespace AMDGPU { static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); + return getMCOpcodeGen(Opcode, static_cast(Gen)); } } } -// This must be kept in sync with the SISubtarget class in SIInstrInfo.td -enum SISubtarget { - SI = 0, - VI = 1 -}; - -static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { - switch (Gen) { - default: - return SI; +static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { + switch (ST.getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + case AMDGPUSubtarget::SEA_ISLANDS: + return SIEncodingFamily::SI; case AMDGPUSubtarget::VOLCANIC_ISLANDS: - return VI; + return SIEncodingFamily::VI; + + // FIXME: This should never be called for r600 GPUs. 
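+  // The r600-family generations only fall back to the SI table to keep this
+  // switch exhaustive; e.g. a VOLCANIC_ISLANDS subtarget selects the VI
+  // opcode column that TableGen emits for getMCOpcodeGen.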
+ case AMDGPUSubtarget::R600: + case AMDGPUSubtarget::R700: + case AMDGPUSubtarget::EVERGREEN: + case AMDGPUSubtarget::NORTHERN_ISLANDS: + return SIEncodingFamily::SI; } + + llvm_unreachable("Unknown subtarget generation!"); } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode( - Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); // -1 means that Opcode is already a native instruction. if (MCOp == -1) @@ -364,14 +118,3 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } - -ArrayRef> -AMDGPUInstrInfo::getSerializableTargetIndices() const { - static const std::pair TargetIndices[] = { - {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; - return makeArrayRef(TargetIndices); -} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 53e8b23b3d62..a59eafadeb93 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -13,12 +13,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H -#include "AMDGPURegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" -#include #define GET_INSTRINFO_HEADER #define GET_INSTRINFO_ENUM @@ -39,78 +37,12 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: - const AMDGPURegisterInfo RI; - virtual void anchor(); -protected: const AMDGPUSubtarget &ST; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const override; - bool hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const override; - unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; - bool hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const; - - MachineInstr * - convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const override; - - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - -protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - 
MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override; + virtual void anchor(); public: - /// \returns the smallest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexBegin(const MachineFunction &MF) const; - - /// \returns the largest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexEnd(const MachineFunction &MF) const; - - bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl &NewMIs) const override; - bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const override; - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); bool enableClusterLoads() const override; @@ -118,81 +50,14 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; - bool isPredicable(MachineInstr *MI) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; - /// \brief Return the descriptor of the target-specific machine instruction - /// that corresponds to the specified pseudo or native opcode. - const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { - return get(pseudoToMCOpcode(Opcode)); - } - - ArrayRef> - getSerializableTargetIndices() const override; - -//===---------------------------------------------------------------------===// -// Pure virtual funtions to be implemented by sub-classes. -//===---------------------------------------------------------------------===// - - virtual bool isMov(unsigned opcode) const = 0; - - /// \brief Calculate the "Indirect Address" for the given \p RegIndex and - /// \p Channel - /// - /// We model indirect addressing using a virtual address space that can be - /// accesed with loads and stores. The "Indirect Address" is the memory - /// address in this virtual address space that maps to the given \p RegIndex - /// and \p Channel. - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const = 0; - - /// \returns The register class to be used for loading and storing values - /// from an "Indirect Address" . 
- virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; - - /// \brief Build instruction(s) for an indirect register write. - /// - /// \returns The instruction that performs the indirect register write - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build instruction(s) for an indirect register read. - /// - /// \returns The instruction that performs the indirect register read - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build a MOV instruction. - virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const = 0; - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the /// equivalent opcode that writes \p Channels Channels. int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; - }; namespace AMDGPU { diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 575dfe413658..2b13bb9079ea 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -44,6 +44,11 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, // AMDGPU DAG Nodes // +def AMDGPUconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, + SDTCisVT<0, iPTR>]> +>; + // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; @@ -63,7 +68,7 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. 
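// (e.g. rsq_clamp(0.0) yields +max_float rather than +inf, matching the
// clamped-infinity behavior of the V_RSQ_CLAMP-style hardware instructions
// this node is eventually selected to.)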
-def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; +def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; @@ -183,6 +188,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", SDTypeProfile<0, 2, []>, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + def AMDGPUround : SDNode<"ISD::FROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; @@ -209,6 +219,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; @@ -241,5 +261,8 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, +def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 2a7ce6a47176..6761b4b5df95 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -class AMDGPUInst pattern> : Instruction { +class AMDGPUInst pattern = []> : Instruction { field bit isRegisterLoad = 0; field bit isRegisterStore = 0; @@ -23,15 +24,22 @@ class AMDGPUInst pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; + // SoftFail is a field the disassembler can use to provide a way for + // instructions to not match without killing the whole decode process. It is + // mainly used for ARM, but Tablegen expects this field to exist or it fails + // to build the decode table. + field bits<64> SoftFail = 0; + + let DecoderNamespace = Namespace; + let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } -class AMDGPUShaderInst pattern> - : AMDGPUInst { +class AMDGPUShaderInst pattern = []> : AMDGPUInst { field bits<32> Inst = 0xffffffff; - } def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; @@ -41,6 +49,13 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps ; def ADDRIndirect : ComplexPattern; +// 32-bit VALU immediate operand that uses the constant bus. +def u32kimm : Operand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_KIMM32"; + let PrintMethod = "printU32ImmOperand"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand { @@ -146,6 +161,17 @@ def COND_NULL : PatLeaf < [{(void)N; return false;}] >; + +//===----------------------------------------------------------------------===// +// Misc. 
PatFrags +//===----------------------------------------------------------------------===// + +class HasOneUseBinOp : PatFrag< + (ops node:$src0, node:$src1), + (op $src0, $src1), + [{ return N->hasOneUse(); }] +>; + //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// @@ -168,21 +194,58 @@ def truncstorei8_private : PrivateStore ; def truncstorei16_private : PrivateStore ; def store_private : PrivateStore ; -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); +class GlobalMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; // Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); +class GlobalLoad : GlobalMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +def global_load : GlobalLoad ; + +// Global address space stores +class GlobalStore : GlobalMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def global_store : GlobalStore ; +def global_store_atomic : GlobalStore; + + +class ConstantMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; }]>; // Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); +class ConstantLoad : ConstantMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +def constant_load : ConstantLoad; + +class LocalMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + +// Local address space loads +class LocalLoad : LocalMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class LocalStore : LocalMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +class FlatMemOp : PatFrag (N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS; }]>; +class FlatLoad : FlatMemOp < + (ops node:$ptr), (op node:$ptr) +>; + class AZExtLoadBase : PatFrag<(ops node:$ptr), (ld_node node:$ptr), [{ LoadSDNode *L = cast(N); @@ -196,29 +259,14 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; }]>; -def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; +def az_extloadi8_global : GlobalLoad ; +def sextloadi8_global : GlobalLoad ; -def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; +def az_extloadi8_constant : ConstantLoad ; +def sextloadi8_constant : ConstantLoad ; -def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; +def az_extloadi8_local : LocalLoad ; +def sextloadi8_local : LocalLoad ; def extloadi8_private : PrivateLoad ; def sextloadi8_private : PrivateLoad ; @@ -227,29 +275,14 @@ def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; }]>; -def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return 
isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; +def az_extloadi16_global : GlobalLoad ; +def sextloadi16_global : GlobalLoad ; -def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; +def az_extloadi16_constant : ConstantLoad ; +def sextloadi16_constant : ConstantLoad ; -def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; +def az_extloadi16_local : LocalLoad ; +def sextloadi16_local : LocalLoad ; def extloadi16_private : PrivateLoad ; def sextloadi16_private : PrivateLoad ; @@ -258,49 +291,20 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; }]>; -def az_extloadi32_global : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; +def az_extloadi32_global : GlobalLoad ; -def az_extloadi32_flat : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; +def az_extloadi32_flat : FlatLoad ; -def az_extloadi32_constant : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; +def az_extloadi32_constant : ConstantLoad ; -def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; +def truncstorei8_global : GlobalStore ; +def truncstorei16_global : GlobalStore ; -def local_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; +def local_store : LocalStore ; +def truncstorei8_local : LocalStore ; +def truncstorei16_local : LocalStore ; -def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; +def local_load : LocalLoad ; class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; @@ -370,6 +374,12 @@ class global_binary_atomic_op : PatFrag< [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] >; +class flat_binary_atomic_op : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] +>; + def atomic_swap_global : global_binary_atomic_op; def atomic_add_global : global_binary_atomic_op; def atomic_and_global : global_binary_atomic_op; @@ -381,6 +391,26 @@ def atomic_umax_global : global_binary_atomic_op; def atomic_umin_global : global_binary_atomic_op; def atomic_xor_global : global_binary_atomic_op; +def atomic_cmp_swap_global : global_binary_atomic_op; +def atomic_cmp_swap_global_nortn : PatFrag< + (ops node:$ptr, node:$value), + (atomic_cmp_swap_global 
node:$ptr, node:$value), + [{ return SDValue(N, 0).use_empty(); }] +>; + +def atomic_swap_flat : flat_binary_atomic_op; +def atomic_add_flat : flat_binary_atomic_op; +def atomic_and_flat : flat_binary_atomic_op; +def atomic_max_flat : flat_binary_atomic_op; +def atomic_min_flat : flat_binary_atomic_op; +def atomic_or_flat : flat_binary_atomic_op; +def atomic_sub_flat : flat_binary_atomic_op; +def atomic_umax_flat : flat_binary_atomic_op; +def atomic_umin_flat : flat_binary_atomic_op; +def atomic_xor_flat : flat_binary_atomic_op; + +def atomic_cmp_swap_flat : flat_binary_atomic_op; + //===----------------------------------------------------------------------===// // Misc Pattern Fragments //===----------------------------------------------------------------------===// @@ -392,6 +422,7 @@ int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP64_ONE = 0x3ff0000000000000; } def CONST : Constants; @@ -570,6 +601,25 @@ class ROTRPattern : Pat < (BIT_ALIGN $src0, $src0, $src1) >; +// This matches 16 permutations of +// max(min(x, y), min(max(x, y), z)) +class IntMed3Pat : Pat< + (max (min_oneuse i32:$src0, i32:$src1), + (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst $src0, $src1, $src2) +>; + +let Properties = [SDNPCommutative, SDNPAssociative] in { +def smax_oneuse : HasOneUseBinOp; +def smin_oneuse : HasOneUseBinOp; +def umax_oneuse : HasOneUseBinOp; +def umin_oneuse : HasOneUseBinOp; +} // Properties = [SDNPCommutative, SDNPAssociative] + + // 24-bit arithmetic patterns def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; @@ -587,13 +637,6 @@ def cvt_flr_i32_f32 : PatFrag < [{ (void)N; return TM.Options.NoNaNsFPMath; }] >; -/* -class UMUL24Pattern : Pat < - (mul U24:$x, U24:$y), - (UMUL24 $x, $y) ->; -*/ - class IMad24Pat : Pat < (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), (Inst $src0, $src1, $src2) @@ -604,30 +647,6 @@ class UMad24Pat : Pat < (Inst $src0, $src1, $src2) >; -multiclass Expand24IBitOps { - def _expand_imad24 : Pat < - (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_imul24 : Pat < - (AMDGPUmul_i24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -multiclass Expand24UBitOps { - def _expand_umad24 : Pat < - (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_umul24 : Pat < - (AMDGPUmul_u24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - class RcpPat : Pat < (fdiv FP_ONE, vt:$src), (RcpInst $src) diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index e94bb6013d83..791872a9db40 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -20,46 +20,44 @@ using namespace llvm; -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -#include "AMDGPUGenIntrinsics.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() : TargetIntrinsicInfo() {} -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - static const char *const names[] = { +static const char *const IntrinsicNameTable[] = { #define GET_INTRINSIC_NAME_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_NAME_TABLE - }; +}; +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { if (IntrID < 
Intrinsic::num_intrinsics) { return nullptr; } assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); return Result; } -unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, +unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, unsigned Len) const { - if (!StringRef(Name, Len).startswith("llvm.")) + StringRef Name(NameData, Len); + if (!Name.startswith("llvm.")) return 0; // All intrinsics start with 'llvm.' -#define GET_FUNCTION_RECOGNIZER -#include "AMDGPUGenIntrinsics.inc" -#undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID = - (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; - IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); - - if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { - return IntrinsicID; + // Look for a name match in our table. If the intrinsic is not overloaded, + // require an exact match. If it is overloaded, require a prefix match. The + // AMDGPU enum enum starts at Intrinsic::num_intrinsics. + int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name); + if (Idx >= 0) { + bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]); + return IsPrefixMatch == isOverloaded(Idx + 1) + ? Intrinsic::num_intrinsics + Idx + : 0; } + return 0; } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index 4c95b5ec0974..f4173929259c 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. // //===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -31,7 +31,7 @@ enum ID { } // end namespace AMDGPUIntrinsic -class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); std::string getName(unsigned IntrId, Type **Tys = nullptr, diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td index 1de3546485b1..2127391f18e7 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -12,79 +12,26 @@ //===----------------------------------------------------------------------===// let TargetPrefix = "AMDGPU", isTarget = 1 in { - - def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; - def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - // This is named backwards 
(instead of rsq_legacy) so we don't have - // to define it with the public builtins intrinsics. This is a - // workaround for how intrinsic names are parsed. If the name is - // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant - // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. - def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; - def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def 
int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of separate int_amdgcn_cube* intrinsics. def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of expanded bit operations def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; -} - -// Legacy names for compatibility. -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -} -let TargetPrefix = "TGSI", isTarget = 1 in { + // Deprecated in favor of llvm.amdgcn.rsq + def int_AMDGPU_rsq : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + >; - def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; + // Deprecated in favor of llvm.amdgcn.read.workdim + def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; } include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index dfc652f31da5..ad8d3e4d3545 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -15,9 +15,9 @@ #include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" -#include "R600InstrInfo.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -37,8 +37,14 @@ using namespace llvm; AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) -{ } + Ctx(ctx), ST(st) { } + +static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { + switch (MOFlags) { + default: return MCSymbolRefExpr::VK_None; + case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL; + } +} void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { @@ -70,11 +76,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + const MCExpr *SymExpr = + 
MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); + const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, + MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + MCOp = MCOperand::createExpr(Expr); break; } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + Sym->setExternal(true); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); MCOp = MCOperand::createExpr(Expr); break; @@ -88,13 +99,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { const AMDGPUSubtarget &STI = MF->getSubtarget(); AMDGPUMCInstLower MCInstLowering(OutContext, STI); -#ifdef _DEBUG StringRef Err; - if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { - errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("Illegal instruction detected: " + Err); MI->dump(); } -#endif + if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); @@ -103,6 +114,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { ++I; } } else { + // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder + // terminator instructions and should only be printed as comments. + if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) { + if (isVerbose()) { + SmallVector BBStr; + raw_svector_ostream Str(BBStr); + + const MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); + const MCSymbolRefExpr *Expr + = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); + Expr->print(Str, MAI); + OutStreamer->emitRawComment(" mask branch " + BBStr); + } + + return; + } + + if (MI->getOpcode() == AMDGPU::SI_RETURN) { + if (isVerbose()) + OutStreamer->emitRawComment(" return"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); @@ -114,10 +148,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { raw_string_ostream DisasmStream(DisasmLine); AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *MF->getSubtarget().getInstrInfo(), - *MF->getSubtarget().getRegisterInfo()); - InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), - MF->getSubtarget()); + *STI.getInstrInfo(), + *STI.getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI); // Disassemble instruction/operands to hex representation. 
SmallVector Fixups; diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h index d322fe072b2b..957dcd0de8ef 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 54137177e4c0..44516dab04f1 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,8 +1,5 @@ #include "AMDGPUMachineFunction.h" -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Function.h" + using namespace llvm; // Pin the vtable to this file. @@ -10,11 +7,17 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), - ShaderType(ShaderType::COMPUTE), + KernArgSize(0), + MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), ScratchSize(0), - IsKernel(true) { + IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL || + MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) +{ +} - ShaderType = AMDGPU::getShaderType(*MF.getFunction()); +bool AMDGPUMachineFunction::isKernel() const +{ + return IsKernel; } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 46fcee874887..6b31f63e1a9d 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -1,4 +1,4 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -6,12 +6,9 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H -#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" #include @@ -19,11 +16,25 @@ namespace llvm { class AMDGPUMachineFunction : public MachineFunctionInfo { + uint64_t KernArgSize; + unsigned MaxKernArgAlign; + virtual void anchor(); - unsigned ShaderType; public: AMDGPUMachineFunction(const MachineFunction &MF); + + uint64_t allocateKernArg(uint64_t Size, unsigned Align) { + assert(isPowerOf2_32(Align)); + KernArgSize = alignTo(KernArgSize, Align); + + uint64_t Result = KernArgSize; + KernArgSize += Size; + + MaxKernArgAlign = std::max(Align, MaxKernArgAlign); + return Result; + } + /// A map to keep track of local memory objects and their offsets within /// the local memory space. std::map LocalMemoryObjects; @@ -33,14 +44,7 @@ public: /// Start of implicit kernel args unsigned ABIArgOffset; - unsigned getShaderType() const { - return ShaderType; - } - - bool isKernel() const { - // FIXME: Assume everything is a kernel until function calls are supported. 
- return true; - } + bool isKernel() const; unsigned ScratchSize; bool IsKernel; diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp index 554bf1da81f5..8bc7b53435be 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -25,7 +25,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 87d50d587059..775463809634 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -16,7 +16,8 @@ #include "AMDGPUSubtarget.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -26,79 +27,317 @@ using namespace llvm; namespace { -class AMDGPUPromoteAlloca : public FunctionPass, - public InstVisitor { - - static char ID; +// FIXME: This can create globals so should be a module pass. +class AMDGPUPromoteAlloca : public FunctionPass { +private: + const TargetMachine *TM; Module *Mod; - const AMDGPUSubtarget &ST; - int LocalMemAvailable; + const DataLayout *DL; + MDNode *MaxWorkGroupSizeRange; + + // FIXME: This should be per-kernel. + uint32_t LocalMemLimit; + uint32_t CurrentLocalMemUsage; + + bool IsAMDGCN; + bool IsAMDHSA; + + std::pair getLocalSizeYZ(IRBuilder<> &Builder); + Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); + + /// BaseAlloca is the alloca root the search started from. + /// Val may be that alloca or a recursive user of it. + bool collectUsesWithPtrTypes(Value *BaseAlloca, + Value *Val, + std::vector &WorkList) const; + + /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand + /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). + /// Returns true if both operands are derived from the same alloca. Val should + /// be the same value as one of the input operands of UseInst. 
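To make the restriction concrete before the declaration that follows: if a
select or icmp mixes pointers derived from two different allocas, rewriting
only one of them into LDS would leave the instruction with operands in
different address spaces. A hypothetical source-level example (not from the
patch) of the case that must be rejected:

    // 'A' alone cannot be promoted, because 'P' may also point into 'B'.
    void kernelFragment(bool Cond) {
      int A[4];                        // candidate alloca
      int B[4];                        // a different object
      int *P = Cond ? &A[0] : &B[0];   // select over two pointer bases
      *P = 1;
    }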
+ bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, + Instruction *UseInst, + int OpIdx0, int OpIdx1) const; public: - AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), - LocalMemAvailable(0) { } + static char ID; + + AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) : + FunctionPass(ID), + TM(TM_), + Mod(nullptr), + DL(nullptr), + MaxWorkGroupSizeRange(nullptr), + LocalMemLimit(0), + CurrentLocalMemUsage(0), + IsAMDGCN(false), + IsAMDHSA(false) { } + bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Promote Alloca"; } - void visitAlloca(AllocaInst &I); + + const char *getPassName() const override { + return "AMDGPU Promote Alloca"; + } + + void handleAlloca(AllocaInst &I); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace char AMDGPUPromoteAlloca::ID = 0; +INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, + "AMDGPU promote alloca to vector or LDS", false, false) + +char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; + + bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + if (!TM) + return false; + Mod = &M; + DL = &Mod->getDataLayout(); + + // The maximum workitem id. + // + // FIXME: Should get as subtarget property. Usually runtime enforced max is + // 256. + MDBuilder MDB(Mod->getContext()); + MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048)); + + const Triple &TT = TM->getTargetTriple(); + + IsAMDGCN = TT.getArch() == Triple::amdgcn; + IsAMDHSA = TT.getOS() == Triple::AMDHSA; + return false; } bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + if (!TM || skipFunction(F)) + return false; - FunctionType *FTy = F.getFunctionType(); - - LocalMemAvailable = ST.getLocalMemorySize(); + const AMDGPUSubtarget &ST = TM->getSubtarget(F); + if (!ST.isPromoteAllocaEnabled()) + return false; + FunctionType *FTy = F.getFunctionType(); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - Type *ParamTy = FTy->getParamType(i); - if (ParamTy->isPointerTy() && - ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemAvailable = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " "local memory disabled.\n"); - break; + return false; } } - if (LocalMemAvailable > 0) { - // Check how much local memory is being used by global objects - for (Module::global_iterator I = Mod->global_begin(), - E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = &*I; - PointerType *GVTy = GV->getType(); - if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast(U); + if (!Use) continue; - for (Value::use_iterator U = GV->use_begin(), - UE = GV->use_end(); U != UE; ++U) { - Instruction *Use = dyn_cast(*U); - if (!Use) - continue; - if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; } } } - LocalMemAvailable = std::max(0, LocalMemAvailable); - DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage); - visit(F); + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. - return false; + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint + = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0); + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy); + + // Program is possibly broken by using more local mem than available. 
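Before the overflow check that follows, a simplified stand-alone model of the
tier selection above. This is an assumption for illustration only; the real
mapping lives in AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount and is
hardware specific:

    #include <algorithm>

    // With a target occupancy of WavesPerCU and workgroups of
    // WavesPerWorkGroup waves, roughly GroupsPerCU groups share one compute
    // unit's LDS pool, so each group's budget is the pool split that many ways.
    static unsigned approxLocalMemLimit(unsigned LDSPerCU, unsigned WavesPerCU,
                                        unsigned WavesPerWorkGroup) {
      unsigned GroupsPerCU = std::max(1u, WavesPerCU / WavesPerWorkGroup);
      return LDSPerCU / GroupsPerCU;   // e.g. 65536 / 8 = 8192 bytes per group
    }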
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + BasicBlock &EntryBB = *F.begin(); + for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { + AllocaInst *AI = dyn_cast(I); + + ++I; + if (AI) + handleAlloca(*AI); + } + + return true; +} + +std::pair +AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { + if (!IsAMDHSA) { + Function *LocalSizeYFn + = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn + = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + + CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); + CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); + + LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + return std::make_pair(LocalSizeY, LocalSizeZ); + } + + // We must read the size out of the dispatch pointer. + assert(IsAMDGCN); + + // We are indexing into this struct, and want to extract the workgroup_size_* + // fields. + // + // typedef struct hsa_kernel_dispatch_packet_s { + // uint16_t header; + // uint16_t setup; + // uint16_t workgroup_size_x ; + // uint16_t workgroup_size_y; + // uint16_t workgroup_size_z; + // uint16_t reserved0; + // uint32_t grid_size_x ; + // uint32_t grid_size_y ; + // uint32_t grid_size_z; + // + // uint32_t private_segment_size; + // uint32_t group_segment_size; + // uint64_t kernel_object; + // + // #ifdef HSA_LARGE_MODEL + // void *kernarg_address; + // #elif defined HSA_LITTLE_ENDIAN + // void *kernarg_address; + // uint32_t reserved1; + // #else + // uint32_t reserved1; + // void *kernarg_address; + // #endif + // uint64_t reserved2; + // hsa_signal_t completion_signal; // uint64_t wrapper + // } hsa_kernel_dispatch_packet_t + // + Function *DispatchPtrFn + = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + + CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); + DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); + DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + + // Size of the dispatch packet struct. + DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64); + + Type *I32Ty = Type::getInt32Ty(Mod->getContext()); + Value *CastDispatchPtr = Builder.CreateBitCast( + DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); + + // We could do a single 64-bit load here, but it's likely that the basic + // 32-bit and extract sequence is already present, and it is probably easier + // to CSE this. The loads should be mergable later anyway. + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); + LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4); + + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); + LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); + + MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); + LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); + LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); + LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + // Extract y component. Upper half of LoadZU should be zero already. 
+ Value *Y = Builder.CreateLShr(LoadXY, 16); + + return std::make_pair(Y, LoadZU); +} + +Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { + Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; + + switch (N) { + case 0: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x + : Intrinsic::r600_read_tidig_x; + break; + case 1: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y + : Intrinsic::r600_read_tidig_y; + break; + + case 2: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z + : Intrinsic::r600_read_tidig_z; + break; + default: + llvm_unreachable("invalid dimension"); + } + + Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + CallInst *CI = Builder.CreateCall(WorkitemIdFn); + CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + return CI; } static VectorType *arrayTypeToVecType(Type *ArrayTy) { @@ -151,17 +390,16 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { - Type *AllocaTy = Alloca->getAllocatedType(); + ArrayType *AllocaTy = dyn_cast(Alloca->getAllocatedType()); - DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. - if (!AllocaTy->isArrayTy() || - AllocaTy->getArrayElementType()->isVectorTy() || - AllocaTy->getArrayNumElements() > 4) { - - DEBUG(dbgs() << " Cannot convert type to vector"); + if (!AllocaTy || + AllocaTy->getElementType()->isVectorTy() || + AllocaTy->getNumElements() > 4) { + DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -200,9 +438,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); - for (std::vector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - Instruction *Inst = cast(*I); + for (Value *V : WorkList) { + Instruction *Inst = cast(V); IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { @@ -239,44 +476,163 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { return true; } -static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { - bool Success = true; +static bool isCallPromotable(CallInst *CI) { + // TODO: We might be able to handle some cases where the callee is a + // constantexpr bitcast of a function. + if (!CI->getCalledFunction()) + return false; + + IntrinsicInst *II = dyn_cast(CI); + if (!II) + return false; + + switch (II->getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::invariant_group_barrier: + case Intrinsic::objectsize: + return true; + default: + return false; + } +} + +bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, + Value *Val, + Instruction *Inst, + int OpIdx0, + int OpIdx1) const { + // Figure out which operand is the one we might not be promoting. + Value *OtherOp = Inst->getOperand(OpIdx0); + if (Val == OtherOp) + OtherOp = Inst->getOperand(OpIdx1); + + if (isa(OtherOp)) + return true; + + Value *OtherObj = GetUnderlyingObject(OtherOp, *DL); + if (!isa(OtherObj)) + return false; + + // TODO: We should be able to replace undefs with the right pointer type. 
+ + // TODO: If we know the other base object is another promotable + // alloca, not necessarily this alloca, we can do this. The + // important part is both must have the same address space at + // the end. + if (OtherObj != BaseAlloca) { + DEBUG(dbgs() << "Found a binary instruction with another alloca object\n"); + return false; + } + + return true; +} + +bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( + Value *BaseAlloca, + Value *Val, + std::vector &WorkList) const { + for (User *User : Val->users()) { - if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) continue; + if (CallInst *CI = dyn_cast(User)) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) + if (!isCallPromotable(CI)) return false; WorkList.push_back(User); continue; } - // FIXME: Correctly handle ptrtoint instructions. - Instruction *UseInst = dyn_cast(User); - if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + Instruction *UseInst = cast(User); + if (UseInst->getOpcode() == Instruction::PtrToInt) return false; - if (StoreInst *SI = dyn_cast_or_null(UseInst)) { + if (LoadInst *LI = dyn_cast_or_null(UseInst)) { + if (LI->isVolatile()) + return false; + + continue; + } + + if (StoreInst *SI = dyn_cast(UseInst)) { + if (SI->isVolatile()) + return false; + // Reject if the stored value is not the pointer operand. if (SI->getPointerOperand() != Val) return false; + } else if (AtomicRMWInst *RMW = dyn_cast_or_null(UseInst)) { + if (RMW->isVolatile()) + return false; + } else if (AtomicCmpXchgInst *CAS + = dyn_cast_or_null(UseInst)) { + if (CAS->isVolatile()) + return false; + } + + // Only promote a select if we know that the other select operand + // is from another pointer that will also be promoted. + if (ICmpInst *ICmp = dyn_cast(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) + return false; + + // May need to rewrite constant operands. + WorkList.push_back(ICmp); } if (!User->getType()->isPointerTy()) continue; - WorkList.push_back(User); + if (GetElementPtrInst *GEP = dyn_cast(UseInst)) { + // Be conservative if an address could be computed outside the bounds of + // the alloca. + if (!GEP->isInBounds()) + return false; + } - Success &= collectUsesWithPtrTypes(User, WorkList); + // Only promote a select if we know that the other select operand is from + // another pointer that will also be promoted. + if (SelectInst *SI = dyn_cast(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) + return false; + } + + // Repeat for phis. + if (PHINode *Phi = dyn_cast(UseInst)) { + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. + switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) + return false; + break; + default: + return false; + } + } + + WorkList.push_back(User); + if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) + return false; } - return Success; + + return true; } -void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { - if (!I.isStaticAlloca()) +// FIXME: Should try to pick the most likely to be profitable allocas first. +void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { + // Array allocations are probably not worth handling, since an allocation of + // the array type is the canonical form. 
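For readers unfamiliar with the distinction drawn in the check that follows:
I.isArrayAllocation() is true for IR such as 'alloca i32, i32 %n', where a
runtime or non-unit element count accompanies the type, while the canonical
fixed-size form is 'alloca [4 x i32]'; isStaticAlloca() additionally requires
a constant size and placement in the function's entry block.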
+ if (!I.isStaticAlloca() || I.isArrayAllocation()) return; IRBuilder<> Builder(&I); @@ -286,95 +642,144 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) + if (tryPromoteAllocaToVector(&I)) { + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + return; + } + + const Function &ContainingFunction = *I.getParent()->getParent(); + + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + if (AMDGPU::isShader(ContainingFunction.getCallingConv())) return; - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + // FIXME: We should also try to get this value from the reqd_work_group_size + // function attribute if it is available. + unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction); - // FIXME: This is the maximum work group size. We should try to get - // value from the reqd_work_group_size function attribute if it is - // available. - unsigned WorkGroupSize = 256; - int AllocaSize = - WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + const DataLayout &DL = Mod->getDataLayout(); - if (AllocaSize > LocalMemAvailable) { - DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + unsigned Align = I.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(I.getAllocatedType()); + + // FIXME: This computed padding is likely wrong since it depends on inverse + // usage order. + // + // FIXME: It is also possible that if we're allowed to use all of the memory + // could could end up using more than the maximum due to alignment padding. 
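The computation just below charges one copy of the alloca per work item
against the LDS budget: AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy).
As a worked example with made-up numbers, a 16-byte alloca in a kernel whose
maximum work group size is 256 adds 256 * 16 = 4096 bytes to
CurrentLocalMemUsage (after aligning the running total to the alloca's
alignment), and the promotion is abandoned if that total would exceed
LocalMemLimit.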
+ + uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); + uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); + NewSize += AllocSize; + + if (NewSize > LocalMemLimit) { + DEBUG(dbgs() << " " << AllocSize + << " bytes of local memory not available to promote\n"); return; } + CurrentLocalMemUsage = NewSize; + std::vector WorkList; - if (!collectUsesWithPtrTypes(&I, WorkList)) { + if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); - LocalMemAvailable -= AllocaSize; - Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + Function *F = I.getParent()->getParent(); + + Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( - *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, - GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); - - FunctionType *FTy = FunctionType::get( - Type::getInt32Ty(Mod->getContext()), false); - AttributeSet AttrSet; - AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); - - Value *ReadLocalSizeY = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.y", FTy, AttrSet); - Value *ReadLocalSizeZ = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.z", FTy, AttrSet); - Value *ReadTIDIGX = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.x", FTy, AttrSet); - Value *ReadTIDIGY = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.y", FTy, AttrSet); - Value *ReadTIDIGZ = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); - Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); - Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); - - Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + *Mod, GVTy, false, GlobalValue::InternalLinkage, + UndefValue::get(GVTy), + Twine(F->getName()) + Twine('.') + I.getName(), + nullptr, + GlobalVariable::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + GV->setAlignment(I.getAlignment()); + + Value *TCntY, *TCntZ; + + std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); + Value *TIdX = getWorkitemID(Builder, 0); + Value *TIdY = getWorkitemID(Builder, 1); + Value *TIdZ = getWorkitemID(Builder, 2); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); - Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); - std::vector Indices; - Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); - Indices.push_back(TID); + Value *Indices[] = { + Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), + TID + }; - Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); - for (std::vector::iterator i = WorkList.begin(), - e = WorkList.end(); i != e; ++i) { - Value *V = *i; + for (Value *V : WorkList) { CallInst *Call = dyn_cast(V); if (!Call) { - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + if (ICmpInst *CI = dyn_cast(V)) { + Value *Src0 = CI->getOperand(0); + Type *EltTy 
= Src0->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + if (isa(CI->getOperand(0))) + CI->setOperand(0, ConstantPointerNull::get(NewTy)); + + if (isa(CI->getOperand(1))) + CI->setOperand(1, ConstantPointerNull::get(NewTy)); + + continue; + } // The operand's value should be corrected on its own. if (isa(V)) continue; + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); + + // Adjust the types of any constant operands. + if (SelectInst *SI = dyn_cast(V)) { + if (isa(SI->getOperand(1))) + SI->setOperand(1, ConstantPointerNull::get(NewTy)); + + if (isa(SI->getOperand(2))) + SI->setOperand(2, ConstantPointerNull::get(NewTy)); + } else if (PHINode *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { + if (isa(Phi->getIncomingValue(I))) + Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy)); + } + } + continue; } IntrinsicInst *Intr = dyn_cast(Call); if (!Intr) { + // FIXME: What is this for? It doesn't make sense to promote arbitrary + // function calls. If the call is to a defined function that can also be + // promoted, we should be able to do this once that function is also + // rewritten. + std::vector ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { @@ -405,6 +810,14 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Intr->eraseFromParent(); continue; } + case Intrinsic::memmove: { + MemMoveInst *MemMove = cast(Intr); + Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), + MemMove->getLength(), MemMove->getAlignment(), + MemMove->isVolatile()); + Intr->eraseFromParent(); + continue; + } case Intrinsic::memset: { MemSetInst *MemSet = cast(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), @@ -413,6 +826,28 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Intr->eraseFromParent(); continue; } + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::invariant_group_barrier: + Intr->eraseFromParent(); + // FIXME: I think the invariant marker should still theoretically apply, + // but the intrinsics need to be changed to accept pointers with any + // address space. 
+ continue; + case Intrinsic::objectsize: { + Value *Src = Intr->getOperand(0); + Type *SrcTy = Src->getType()->getPointerElementType(); + Function *ObjectSize = Intrinsic::getDeclaration(Mod, + Intrinsic::objectsize, + { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } + ); + + CallInst *NewCall + = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); + Intr->replaceAllUsesWith(NewCall); + Intr->eraseFromParent(); + continue; + } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); @@ -420,6 +855,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { } } -FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { - return new AMDGPUPromoteAlloca(ST); +FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) { + return new AMDGPUPromoteAlloca(TM); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 3ca0eca3417f..941f2d8a468a 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -24,20 +24,14 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. //===----------------------------------------------------------------------===// -const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; +// Dummy to not crash RegisterClassInfo. +static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; -const MCPhysReg* -AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { +const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { return &CalleeSavedReg; } -void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("Subroutines not supported yet"); -} - unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { return AMDGPU::NoRegister; } @@ -54,10 +48,5 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { return SubRegs[Channel]; } -unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { - - return getSubRegFromChannel(IndirectIndex); -} - #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 0344834328f6..ef51aad95dce 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -13,10 +13,9 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H -#include "llvm/ADT/BitVector.h" #include "llvm/Target/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -29,30 +28,14 @@ class AMDGPUSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - static const MCPhysReg CalleeSavedReg; - AMDGPURegisterInfo(); - BitVector getReservedRegs(const MachineFunction &MF) const override { - assert(!"Unimplemented"); return BitVector(); - } - - virtual unsigned getHWRegIndex(unsigned Reg) const { - assert(!"Unimplemented"); return 0; - } - /// \returns the sub reg enum value for the given \p Channel /// (e.g. 
  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   unsigned getSubRegFromChannel(unsigned Channel) const;
 
   const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
-  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
-                           unsigned FIOperandNum,
-                           RegScavenger *RS) const override;
   unsigned getFrameRegister(const MachineFunction &MF) const override;
-
-  unsigned getIndirectSubReg(unsigned IndirectIndex) const;
-
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
new file mode 100644
index 000000000000..40f639434507
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -0,0 +1,138 @@
+//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and structure types used by runtime metadata.
+///
+/// The runtime requests certain information (metadata) about kernels to be
+/// able to execute the kernels and answer queries about them.
+/// The metadata is represented as a byte stream in an ELF section of a
+/// binary (code object). The byte stream consists of key-value pairs.
+/// Each key is an 8 bit unsigned integer. Each value can be an integer,
+/// a string, or a stream of key-value pairs. There are 3 levels of key-value
+/// pair streams. At the beginning of the ELF section is the top level
+/// key-value pair stream. A kernel-level key-value pair stream starts after
+/// encountering KeyKernelBegin and ends immediately before encountering
+/// KeyKernelEnd. A kernel-argument-level key-value pair stream starts
+/// after encountering KeyArgBegin and ends immediately before encountering
+/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top
+/// level key-value pair stream. A kernel-argument-level key-value pair stream
+/// can only appear in a kernel-level key-value pair stream.
+///
+/// The format should be kept backward compatible. New enum values and bit
+/// fields should be appended at the end. It is suggested to bump the
+/// revision number whenever the format changes and to document the change
+/// in this header.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+
+#include <stdint.h>
+
+namespace AMDGPU {
+
+namespace RuntimeMD {
+
+  // Version and revision of runtime metadata
+  const unsigned char MDVersion  = 1;
+  const unsigned char MDRevision = 0;
+
+  // ELF section name containing runtime metadata
+  const char SectionName[] = ".AMDGPU.runtime_metadata";
+
+  // Enumeration values of keys in runtime metadata.
+  enum Key {
+    KeyNull                    = 0,  // Placeholder. Ignored when encountered
+    KeyMDVersion               = 1,  // Runtime metadata version
+    KeyLanguage                = 2,  // Language
+    KeyLanguageVersion         = 3,  // Language version
+    KeyKernelBegin             = 4,  // Beginning of kernel-level stream
+    KeyKernelEnd               = 5,  // End of kernel-level stream
+    KeyKernelName              = 6,  // Kernel name
+    KeyArgBegin                = 7,  // Beginning of kernel-arg-level stream
+    KeyArgEnd                  = 8,  // End of kernel-arg-level stream
+    KeyArgSize                 = 9,  // Kernel arg size
+    KeyArgAlign                = 10, // Kernel arg alignment
+    KeyArgTypeName             = 11, // Kernel argument type name
+    KeyArgName                 = 12, // Kernel argument name
+    KeyArgTypeKind             = 13, // Kernel argument type kind
+    KeyArgValueType            = 14, // Kernel argument value type
+    KeyArgAddrQual             = 15, // Kernel argument address qualifier
+    KeyArgAccQual              = 16, // Kernel argument access qualifier
+    KeyArgIsConst              = 17, // Kernel argument is const qualified
+    KeyArgIsRestrict           = 18, // Kernel argument is restrict qualified
+    KeyArgIsVolatile           = 19, // Kernel argument is volatile qualified
+    KeyArgIsPipe               = 20, // Kernel argument is pipe qualified
+    KeyReqdWorkGroupSize       = 21, // Required work group size
+    KeyWorkGroupSizeHint       = 22, // Work group size hint
+    KeyVecTypeHint             = 23, // Vector type hint
+    KeyKernelIndex             = 24, // Kernel index for device enqueue
+    KeySGPRs                   = 25, // Number of SGPRs
+    KeyVGPRs                   = 26, // Number of VGPRs
+    KeyMinWavesPerSIMD         = 27, // Minimum number of waves per SIMD
+    KeyMaxWavesPerSIMD         = 28, // Maximum number of waves per SIMD
+    KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits
+    KeyMaxWorkGroupSize        = 30, // Maximum work group size
+    KeyNoPartialWorkGroups     = 31, // No partial work groups
+  };
+
+  enum Language : uint8_t {
+    OpenCL_C   = 0,
+    HCC        = 1,
+    OpenMP     = 2,
+    OpenCL_CPP = 3,
+  };
+
+  enum LanguageVersion : uint16_t {
+    V100 = 100,
+    V110 = 110,
+    V120 = 120,
+    V200 = 200,
+    V210 = 210,
+  };
+
+  namespace KernelArg {
+    enum TypeKind : uint8_t {
+      Value   = 0,
+      Pointer = 1,
+      Image   = 2,
+      Sampler = 3,
+      Queue   = 4,
+    };
+
+    enum ValueType : uint16_t {
+      Struct = 0,
+      I8     = 1,
+      U8     = 2,
+      I16    = 3,
+      U16    = 4,
+      F16    = 5,
+      I32    = 6,
+      U32    = 7,
+      F32    = 8,
+      I64    = 9,
+      U64    = 10,
+      F64    = 11,
+    };
+
+    enum AccessQualifer : uint8_t {
+      None      = 0,
+      ReadOnly  = 1,
+      WriteOnly = 2,
+      ReadWrite = 3,
+    };
+  } // namespace KernelArg
+} // namespace RuntimeMD
+} // namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
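Since the header above only pins down how the key levels nest, here is a hedged consumer sketch that validates that nesting against the enum. skipValue is a hypothetical helper (declared, not defined) because the per-key value encodings are not spelled out in this file:

// Walks the byte stream and checks the three-level structure described in
// the file comment: Arg streams only inside Kernel streams, Kernel streams
// only at the top level.
#include "AMDGPURuntimeMetadata.h"
#include <stdint.h>
using namespace AMDGPU::RuntimeMD;

// Hypothetical: advances past the value of a non-structural key, or
// returns nullptr on a malformed stream.
const uint8_t *skipValue(uint8_t Key, const uint8_t *P, const uint8_t *End);

bool validateNesting(const uint8_t *P, const uint8_t *End) {
  enum { Top, Kernel, Arg } Level = Top;
  while (P && P < End) {
    uint8_t K = *P++;
    switch (K) {
    case KeyKernelBegin: if (Level != Top)    return false; Level = Kernel; break;
    case KeyKernelEnd:   if (Level != Kernel) return false; Level = Top;    break;
    case KeyArgBegin:    if (Level != Kernel) return false; Level = Arg;    break;
    case KeyArgEnd:      if (Level != Arg)    return false; Level = Kernel; break;
    default:             P = skipValue(K, P, End); break;
    }
  }
  return P && Level == Top; // every Begin must have been closed
}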
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 7d70fa73da29..10fa9cf46737 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -15,7 +15,6 @@
 #include "AMDGPUSubtarget.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
-#include "R600MachineScheduler.h"
 #include "SIFrameLowering.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
@@ -32,6 +31,8 @@ using namespace llvm;
 #define GET_SUBTARGETINFO_CTOR
 #include "AMDGPUGenSubtargetInfo.inc"
 
+AMDGPUSubtarget::~AMDGPUSubtarget() {}
+
 AMDGPUSubtarget &
 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                  StringRef GPU, StringRef FS) {
@@ -44,14 +45,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
 
-  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
-    FullFS += "+flat-for-global,";
+    FullFS += "+flat-for-global,+unaligned-buffer-access,";
   FullFS += FS;
 
-  if (GPU == "" && TT.getArch() == Triple::amdgcn)
-    GPU = "SI";
-
   ParseSubtargetFeatures(GPU, FullFS);
 
   // FIXME: I don't think Evergreen has any useful support for
@@ -61,52 +59,142 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
     FP32Denormals = false;
     FP64Denormals = false;
   }
+
+  // Set defaults if needed.
+  if (MaxPrivateElementSize == 0)
+    MaxPrivateElementSize = 4;
+
   return *this;
 }
 
 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
-                                 TargetMachine &TM)
-    : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
-      DumpCode(false), R600ALUInst(false), HasVertexCache(false),
-      TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
-      FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
-      CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
-      EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
-      EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
-      EnableXNACK(false),
-      WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
-      EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
-      GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
-      IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
-      EnableSIScheduler(false), FrameLowering(nullptr),
-      InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
-
+                                 const TargetMachine &TM)
+  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+    TargetTriple(TT),
+    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+    IsaVersion(ISAVersion0_0_0),
+    WavefrontSize(64),
+    LocalMemorySize(0),
+    LDSBankCount(0),
+    MaxPrivateElementSize(0),
+
+    FastFMAF32(false),
+    HalfRate64Ops(false),
+
+    FP32Denormals(false),
+    FP64Denormals(false),
+    FPExceptions(false),
+    FlatForGlobal(false),
+    UnalignedBufferAccess(false),
+
+    EnableXNACK(false),
+    DebuggerInsertNops(false),
+    DebuggerReserveRegs(false),
+    DebuggerEmitPrologue(false),
+
+    EnableVGPRSpilling(false),
+    EnablePromoteAlloca(false),
+    EnableLoadStoreOpt(false),
+    EnableUnsafeDSOffsetFolding(false),
+    EnableSIScheduler(false),
+    DumpCode(false),
+
+    FP64(false),
+    IsGCN(false),
+    GCN1Encoding(false),
+    GCN3Encoding(false),
+    CIInsts(false),
+    SGPRInitBug(false),
+    HasSMemRealTime(false),
+    Has16BitInsts(false),
+    FlatAddressSpace(false),
+
+    R600ALUInst(false),
+    CaymanISA(false),
+    CFALUBug(false),
+    HasVertexCache(false),
+    TexVTXClauseSize(0),
+
+    FeatureDisable(false),
+    InstrItins(getInstrItineraryForCPU(GPU)) {
   initializeSubtargetDependencies(TT, GPU, FS);
+}
 
-  const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)
-
-  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    InstrInfo.reset(new R600InstrInfo(*this));
-    TLInfo.reset(new R600TargetLowering(TM, *this));
-
-    // FIXME: Should have R600 specific FrameLowering
-    FrameLowering.reset(new AMDGPUFrameLowering(
-                        TargetFrameLowering::StackGrowsUp,
-                        MaxStackAlign,
-                        0));
-  } else {
-    InstrInfo.reset(new SIInstrInfo(*this));
-    TLInfo.reset(new SITargetLowering(TM, *this));
-    FrameLowering.reset(new SIFrameLowering(
-                        TargetFrameLowering::StackGrowsUp,
-                        MaxStackAlign,
-                        0));
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+  switch (NWaves) {
+  case 10:
+    return 1638;
+  case 9:
+    return 1820;
+  case 8:
+    return 2048;
+  case 7:
+    return 2340;
+  case 6:
+    return 2730;
+  case 5:
+    return 3276;
+  case 4:
+    return 4096;
+  case 3:
+    return 5461;
+  case 2:
+    return 8192;
+  default:
+    return getLocalMemorySize();
+  }
+}
+
-unsigned AMDGPUSubtarget::getStackEntrySize() const {
-  assert(getGeneration() <= NORTHERN_ISLANDS);
-  switch(getWavefrontSize()) {
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+  if (Bytes <= 1638)
+    return 10;
+
+  if (Bytes <= 1820)
+    return 9;
+
+  if (Bytes <= 2048)
+    return 8;
+
+  if (Bytes <= 2340)
+    return 7;
+
+  if (Bytes <= 2730)
+    return 6;
+
+  if (Bytes <= 3276)
+    return 5;
+
+  if (Bytes <= 4096)
+    return 4;
+
+  if (Bytes <= 5461)
+    return 3;
+
+  if (Bytes <= 8192)
+    return 2;
+
+  return 1;
+}
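A sanity check on the two tables above, for readers of the patch: every entry equals floor(16384 / NWaves), for instance 16384 / 6 = 2730, which is what you get from SI's 64 KB of LDS per compute unit divided across its four SIMDs. getOccupancyWithLocalMemSize is then just the inverse lookup: a kernel using 3000 bytes of LDS lands in the 2730 < Bytes <= 3276 bucket and is capped at 5 waves. (That derivation is our reading; the FIXME above already flags that newer chips may differ.)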
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                             const TargetMachine &TM) :
+  AMDGPUSubtarget(TT, GPU, FS, TM),
+  InstrInfo(*this),
+  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+  TLInfo(TM, *this) {}
+
+SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                         const TargetMachine &TM) :
+  AMDGPUSubtarget(TT, GPU, FS, TM),
+  InstrInfo(*this),
+  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+  TLInfo(TM, *this),
+  GISel() {}
+
+unsigned R600Subtarget::getStackEntrySize() const {
+  switch (getWavefrontSize()) {
   case 16:
     return 8;
   case 32:
@@ -118,37 +206,36 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
   }
 }
 
-unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
-  switch(getGeneration()) {
-  default: llvm_unreachable("ChipID unknown");
-  case SEA_ISLANDS: return 12;
-  }
-}
-
-AMDGPU::IsaVersion AMDGPUSubtarget::getIsaVersion() const {
-  return AMDGPU::getIsaVersion(getFeatureBits());
+void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                      unsigned NumRegionInstrs) const {
+  // Track register pressure so the scheduler can try to decrease
+  // pressure once register usage is above the threshold defined by
+  // SIRegisterInfo::getRegPressureSetLimit()
+  Policy.ShouldTrackPressure = true;
+
+  // Enabling both top down and bottom up scheduling seems to give us less
+  // register spills than just using one of these approaches on its own.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+
+  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+  if (!enableSIScheduler())
+    Policy.ShouldTrackLaneMasks = true;
 }
 
-bool AMDGPUSubtarget::isVGPRSpillingEnabled(
-  const SIMachineFunctionInfo *MFI) const {
-  return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
 
-void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
-                                          MachineInstr *begin,
-                                          MachineInstr *end,
-                                          unsigned NumRegionInstrs) const {
-  if (getGeneration() >= SOUTHERN_ISLANDS) {
-
-    // Track register pressure so the scheduler can try to decrease
-    // pressure once register usage is above the threshold defined by
-    // SIRegisterInfo::getRegPressureSetLimit()
-    Policy.ShouldTrackPressure = true;
-
-    // Enabling both top down and bottom up scheduling seems to give us less
-    // register spills than just using one of these approaches on its own.
-    Policy.OnlyTopDown = false;
-    Policy.OnlyBottomUp = false;
+unsigned SISubtarget::getAmdKernelCodeChipID() const {
+  switch (getGeneration()) {
+  case SEA_ISLANDS:
+    return 12;
+  default:
+    llvm_unreachable("ChipID unknown");
   }
 }
 
+AMDGPU::IsaVersion SISubtarget::getIsaVersion() const {
+  return AMDGPU::getIsaVersion(getFeatureBits());
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 49c94f1eceb8..3fe61aa449e0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,12 +16,14 @@
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
 
 #include "AMDGPU.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUISelLowering.h"
-#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "R600ISelLowering.h"
+#include "R600FrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIFrameLowering.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 #define GET_SUBTARGETINFO_HEADER
@@ -30,9 +32,9 @@
 namespace llvm {
 
 class SIMachineFunctionInfo;
+class StringRef;
 
 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
-
 public:
   enum Generation {
     R600 = 0,
@@ -44,10 +46,6 @@ public:
     VOLCANIC_ISLANDS,
   };
 
-  enum {
-    FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
-  };
-
   enum {
     ISAVersion0_0_0,
    ISAVersion7_0_0,
@@ -57,114 +55,116 @@ public:
     ISAVersion8_0_3
   };
 
-private:
-  std::string DevName;
-  bool Is64bit;
-  bool DumpCode;
-  bool R600ALUInst;
-  bool HasVertexCache;
-  short TexVTXClauseSize;
+protected:
+  // Basic subtarget description.
+  Triple TargetTriple;
   Generation Gen;
-  bool FP64;
-  bool FP64Denormals;
-  bool FP32Denormals;
+  unsigned IsaVersion;
+  unsigned WavefrontSize;
+  int LocalMemorySize;
+  int LDSBankCount;
+  unsigned MaxPrivateElementSize;
+
+  // Possibly statically set by tablegen, but may want to be overridden.
   bool FastFMAF32;
-  bool CaymanISA;
-  bool FlatAddressSpace;
+  bool HalfRate64Ops;
+
+  // Dynamically set bits that enable features.
+  bool FP32Denormals;
+  bool FP64Denormals;
+  bool FPExceptions;
   bool FlatForGlobal;
-  bool EnableIRStructurizer;
+  bool UnalignedBufferAccess;
+  bool EnableXNACK;
+  bool DebuggerInsertNops;
+  bool DebuggerReserveRegs;
+  bool DebuggerEmitPrologue;
+
+  // Used as options.
+  bool EnableVGPRSpilling;
   bool EnablePromoteAlloca;
-  bool EnableIfCvt;
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
-  bool EnableXNACK;
-  unsigned WavefrontSize;
-  bool CFALUBug;
-  int LocalMemorySize;
-  bool EnableVGPRSpilling;
-  bool SGPRInitBug;
+  bool EnableSIScheduler;
+  bool DumpCode;
+
+  // Subtarget properties statically set by tablegen.
+  bool FP64;
   bool IsGCN;
   bool GCN1Encoding;
   bool GCN3Encoding;
   bool CIInsts;
+  bool SGPRInitBug;
+  bool HasSMemRealTime;
+  bool Has16BitInsts;
+  bool FlatAddressSpace;
+
+  bool R600ALUInst;
+  bool CaymanISA;
+  bool CFALUBug;
+  bool HasVertexCache;
+  short TexVTXClauseSize;
+
+  // Dummy feature to use for assembler in tablegen.
  bool FeatureDisable;
 
-  int LDSBankCount;
-  unsigned IsaVersion;
-  bool EnableHugeScratchBuffer;
-  bool EnableSIScheduler;
-
-  std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
-  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
-  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
   InstrItineraryData InstrItins;
-  Triple TargetTriple;
 
 public:
-  AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
-                  TargetMachine &TM);
+  AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                  const TargetMachine &TM);
+  virtual ~AMDGPUSubtarget();
+
   AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                    StringRef GPU, StringRef FS);
 
-  const AMDGPUFrameLowering *getFrameLowering() const override {
-    return FrameLowering.get();
-  }
-  const AMDGPUInstrInfo *getInstrInfo() const override {
-    return InstrInfo.get();
-  }
-  const AMDGPURegisterInfo *getRegisterInfo() const override {
-    return &InstrInfo->getRegisterInfo();
-  }
-  AMDGPUTargetLowering *getTargetLowering() const override {
-    return TLInfo.get();
-  }
+  const AMDGPUInstrInfo *getInstrInfo() const override;
+  const AMDGPUFrameLowering *getFrameLowering() const override;
+  const AMDGPUTargetLowering *getTargetLowering() const override;
+  const AMDGPURegisterInfo *getRegisterInfo() const override;
+
   const InstrItineraryData *getInstrItineraryData() const override {
     return &InstrItins;
   }
 
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  bool is64bit() const {
-    return Is64bit;
-  }
-
-  bool hasVertexCache() const {
-    return HasVertexCache;
-  }
-
-  short getTexVTXClauseSize() const {
-    return TexVTXClauseSize;
+  bool isAmdHsaOS() const {
+    return TargetTriple.getOS() == Triple::AMDHSA;
   }
 
   Generation getGeneration() const {
     return Gen;
   }
 
-  bool hasHWFP64() const {
-    return FP64;
+  unsigned getWavefrontSize() const {
+    return WavefrontSize;
   }
 
-  bool hasCaymanISA() const {
-    return CaymanISA;
+  int getLocalMemorySize() const {
+    return LocalMemorySize;
   }
 
-  bool hasFP32Denormals() const {
-    return FP32Denormals;
+  int getLDSBankCount() const {
+    return LDSBankCount;
   }
 
-  bool hasFP64Denormals() const {
-    return FP64Denormals;
+  unsigned getMaxPrivateElementSize() const {
+    return MaxPrivateElementSize;
+  }
+
+  bool hasHWFP64() const {
+    return FP64;
   }
 
   bool hasFastFMAF32() const {
     return FastFMAF32;
   }
 
-  bool hasFlatAddressSpace() const {
-    return FlatAddressSpace;
+  bool hasHalfRate64Ops() const {
+    return HalfRate64Ops;
   }
 
-  bool useFlatForGlobal() const {
-    return FlatForGlobal;
+  bool hasAddr64() const {
+    return (getGeneration() < VOLCANIC_ISLANDS);
   }
 
   bool hasBFE() const {
@@ -214,116 +214,249 @@ public:
     return (getGeneration() >= EVERGREEN);
   }
 
-  bool IsIRStructurizerEnabled() const {
-    return EnableIRStructurizer;
+  bool hasCaymanISA() const {
+    return CaymanISA;
   }
 
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
 
-  bool isIfCvtEnabled() const {
-    return EnableIfCvt;
+  bool unsafeDSOffsetFoldingEnabled() const {
+    return EnableUnsafeDSOffsetFolding;
   }
 
-  bool loadStoreOptEnabled() const {
-    return EnableLoadStoreOpt;
+  bool dumpCode() const {
+    return DumpCode;
   }
 
-  bool unsafeDSOffsetFoldingEnabled() const {
-    return EnableUnsafeDSOffsetFolding;
+  /// Return the amount of LDS that can be used that will not restrict the
+  /// occupancy lower than WaveCount.
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+  /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
+  /// if the given LDS memory size is the only constraint.
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+
+  bool hasFP32Denormals() const {
+    return FP32Denormals;
   }
 
-  unsigned getWavefrontSize() const {
-    return WavefrontSize;
+  bool hasFP64Denormals() const {
+    return FP64Denormals;
   }
 
-  unsigned getStackEntrySize() const;
+  bool hasFPExceptions() const {
+    return FPExceptions;
+  }
 
-  bool hasCFAluBug() const {
-    assert(getGeneration() <= NORTHERN_ISLANDS);
-    return CFALUBug;
+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
   }
 
-  int getLocalMemorySize() const {
-    return LocalMemorySize;
+  bool hasUnalignedBufferAccess() const {
+    return UnalignedBufferAccess;
  }
 
-  bool hasSGPRInitBug() const {
-    return SGPRInitBug;
+  bool isXNACKEnabled() const {
+    return EnableXNACK;
   }
 
-  int getLDSBankCount() const {
-    return LDSBankCount;
+  unsigned getMaxWavesPerCU() const {
+    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 10;
+
+    // FIXME: Not sure what this is for other subtargets.
+    return 8;
   }
 
-  unsigned getAmdKernelCodeChipID() const;
+  /// \brief Returns the offset in bytes from the start of the input buffer
+  ///        of the first explicit kernel argument.
+  unsigned getExplicitKernelArgOffset() const {
+    return isAmdHsaOS() ? 0 : 36;
+  }
 
-  AMDGPU::IsaVersion getIsaVersion() const;
+  unsigned getStackAlignment() const {
+    // Scratch is allocated in 256 dword per wave blocks.
+    return 4 * 256 / getWavefrontSize();
+  }
 
   bool enableMachineScheduler() const override {
     return true;
   }
 
-  void overrideSchedPolicy(MachineSchedPolicy &Policy,
-                           MachineInstr *begin, MachineInstr *end,
-                           unsigned NumRegionInstrs) const override;
+  bool enableSubRegLiveness() const override {
+    return true;
+  }
+};
 
-  // Helper functions to simplify if statements
-  bool isTargetELF() const {
-    return false;
+class R600Subtarget final : public AMDGPUSubtarget {
+private:
+  R600InstrInfo InstrInfo;
+  R600FrameLowering FrameLowering;
+  R600TargetLowering TLInfo;
+
+public:
+  R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+                const TargetMachine &TM);
+
+  const R600InstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
   }
 
-  StringRef getDeviceName() const {
-    return DevName;
+  const R600FrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
   }
 
-  bool enableHugeScratchBuffer() const {
-    return EnableHugeScratchBuffer;
+  const R600TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
   }
 
-  bool enableSIScheduler() const {
-    return EnableSIScheduler;
+  const R600RegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
   }
 
-  bool dumpCode() const {
-    return DumpCode;
+  bool hasCFAluBug() const {
+    return CFALUBug;
   }
-  bool r600ALUEncoding() const {
-    return R600ALUInst;
+
+  bool hasVertexCache() const {
+    return HasVertexCache;
   }
-  bool isAmdHsaOS() const {
-    return TargetTriple.getOS() == Triple::AMDHSA;
+
+  short getTexVTXClauseSize() const {
+    return TexVTXClauseSize;
   }
-  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
 
-  bool isXNACKEnabled() const {
-    return EnableXNACK;
+  unsigned getStackEntrySize() const;
+};
+
+class SISubtarget final : public AMDGPUSubtarget {
+public:
+  enum {
+    FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
+  };
+
+private:
+  SIInstrInfo InstrInfo;
+  SIFrameLowering FrameLowering;
+  SITargetLowering TLInfo;
+  std::unique_ptr<GISelAccessor> GISel;
+
+public:
+  SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+              const TargetMachine &TM);
+
+  const SIInstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
   }
 
-  unsigned getMaxWavesPerCU() const {
-    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
-      return 10;
+  const SIFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
 
-    // FIXME: Not sure what this is for other subtagets.
-    llvm_unreachable("do not know max waves per CU for this subtarget.");
+  const SITargetLowering *getTargetLowering() const override {
+    return &TLInfo;
   }
 
-  bool enableSubRegLiveness() const override {
-    return true;
+  const CallLowering *getCallLowering() const override {
+    assert(GISel && "Access to GlobalISel APIs not set");
+    return GISel->getCallLowering();
   }
 
-  /// \brief Returns the offset in bytes from the start of the input buffer
-  ///        of the first explicit kernel argument.
-  unsigned getExplicitKernelArgOffset() const {
-    return isAmdHsaOS() ? 0 : 36;
+  const SIRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  void setGISelAccessor(GISelAccessor &GISel) {
+    this->GISel.reset(&GISel);
   }
 
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           unsigned NumRegionInstrs) const override;
+
+  bool isVGPRSpillingEnabled(const Function& F) const;
+
+  unsigned getAmdKernelCodeChipID() const;
+
+  AMDGPU::IsaVersion getIsaVersion() const;
+
   unsigned getMaxNumUserSGPRs() const {
     return 16;
   }
+
+  bool hasFlatAddressSpace() const {
+    return FlatAddressSpace;
+  }
+
+  bool hasSMemRealTime() const {
+    return HasSMemRealTime;
+  }
+
+  bool has16BitInsts() const {
+    return Has16BitInsts;
+  }
+
+  bool enableSIScheduler() const {
+    return EnableSIScheduler;
+  }
+
+  bool debuggerSupported() const {
+    return debuggerInsertNops() && debuggerReserveRegs() &&
+      debuggerEmitPrologue();
+  }
+
+  bool debuggerInsertNops() const {
+    return DebuggerInsertNops;
+  }
+
+  bool debuggerReserveRegs() const {
+    return DebuggerReserveRegs;
+  }
+
+  bool debuggerEmitPrologue() const {
+    return DebuggerEmitPrologue;
+  }
+
+  bool loadStoreOptEnabled() const {
+    return EnableLoadStoreOpt;
+  }
+
+  bool hasSGPRInitBug() const {
+    return SGPRInitBug;
+  }
 };
+
+inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const {
+  if (getGeneration() >= SOUTHERN_ISLANDS)
+    return static_cast<const SISubtarget *>(this)->getInstrInfo();
+
+  return static_cast<const R600Subtarget *>(this)->getInstrInfo();
+}
+
+inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const {
+  if (getGeneration() >= SOUTHERN_ISLANDS)
+    return static_cast<const SISubtarget *>(this)->getFrameLowering();
+
+  return static_cast<const R600Subtarget *>(this)->getFrameLowering();
+}
+
+inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const {
+  if (getGeneration() >= SOUTHERN_ISLANDS)
+    return static_cast<const SISubtarget *>(this)->getTargetLowering();
+
+  return static_cast<const R600Subtarget *>(this)->getTargetLowering();
+}
+
+inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const {
+  if (getGeneration() >= SOUTHERN_ISLANDS)
+    return static_cast<const SISubtarget *>(this)->getRegisterInfo();
+
+  return static_cast<const R600Subtarget *>(this)->getRegisterInfo();
+}
+
 } // End namespace llvm
 
 #endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 519ae5cc748d..3e53f52c689f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,19 +14,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUTargetMachine.h"
-#include "AMDGPUTargetObjectFile.h"
 #include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
"R600MachineScheduler.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" + #include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/LegacyPassManager.h" @@ -34,10 +38,35 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" using namespace llvm; +static cl::opt EnableR600StructurizeCFG( + "r600-ir-structurize", + cl::desc("Use StructurizeCFG IR pass"), + cl::init(true)); + +static cl::opt EnableSROA( + "amdgpu-sroa", + cl::desc("Run SROA after promote alloca pass"), + cl::ReallyHidden, + cl::init(true)); + +static cl::opt EnableR600IfConvert( + "r600-if-convert", + cl::desc("Use if conversion pass"), + cl::ReallyHidden, + cl::init(true)); + +// Option to disable vectorizer for tests. +static cl::opt EnableLoadStoreVectorizer( + "amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(TheAMDGPUTarget); @@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); - initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPUPromoteAllocaPass(*PR); + initializeAMDGPUCodeGenPreparePass(*PR); + initializeSIAnnotateControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); + initializeSIInsertWaitsPass(*PR); + initializeSIWholeQuadModePass(*PR); + initializeSILowerControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { - if (TT.getOS() == Triple::AMDHSA) - return make_unique(); - return make_unique(); } @@ -73,60 +107,156 @@ static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler); -static std::string computeDataLayout(const Triple &TT) { - std::string Ret = "e-p:32:32"; - - if (TT.getArch() == Triple::amdgcn) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; +static StringRef computeDataLayout(const Triple &TT) { + if (TT.getArch() == Triple::r600) { + // 32-bit pointers. + return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; + // 32-bit private, local, and region pointers. 64-bit global, constant and + // flat. 
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; +} + +LLVM_READNONE +static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { + if (!GPU.empty()) + return GPU; - return Ret; + // HSA only supports CI+, so change the default GPU to a CI for HSA. + if (TT.getArch() == Triple::amdgcn) + return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; + + return "r600"; +} + +static Reloc::Model getEffectiveRelocModel(Optional RM) { + // The AMDGPU toolchain only supports generating shared objects, so we + // must always use PIC. + return Reloc::PIC_; } AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, + TargetOptions Options, + Optional RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), + FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), + TLOF(createTLOF(getTargetTriple())), + IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() { } +StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { + Attribute GPUAttr = F.getFnAttribute("target-cpu"); + return GPUAttr.hasAttribute(Attribute::None) ? + getTargetCPU() : GPUAttr.getValueAsString(); +} + +StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { + Attribute FSAttr = F.getFnAttribute("target-features"); + + return FSAttr.hasAttribute(Attribute::None) ? + getTargetFeatureString() : + FSAttr.getValueAsString(); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, + StringRef CPU, StringRef FS, + TargetOptions Options, + Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + +const R600Subtarget *R600TargetMachine::getSubtargetImpl( + const Function &F) const { + StringRef GPU = getGPUName(F); + StringRef FS = getFeatureString(F); + + SmallString<128> SubtargetKey(GPU); + SubtargetKey.append(FS); + + auto &I = SubtargetMap[SubtargetKey]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique(TargetTriple, GPU, FS, *this); + } + + return I.get(); +} //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct SIGISelActualAccessor : public GISelAccessor { + std::unique_ptr CallLoweringInfo; + const AMDGPUCallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } +}; +} // End anonymous namespace. 
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct SIGISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+  const AMDGPUCallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+};
+} // End anonymous namespace.
+#endif
+
 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
-                                   StringRef FS, StringRef CPU,
-                                   TargetOptions Options, Reloc::Model RM,
+                                   StringRef CPU, StringRef FS,
+                                   TargetOptions Options,
+                                   Optional<Reloc::Model> RM,
                                    CodeModel::Model CM, CodeGenOpt::Level OL)
-    : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+  StringRef GPU = getGPUName(F);
+  StringRef FS = getFeatureString(F);
+
+  SmallString<128> SubtargetKey(GPU);
+  SubtargetKey.append(FS);
+
+  auto &I = SubtargetMap[SubtargetKey];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+    GISel->CallLoweringInfo.reset(
+      new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+    I->setGISelAccessor(*GISel);
+  }
+
+  return I.get();
+}
 
 //===----------------------------------------------------------------------===//
 // AMDGPU Pass Setup
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
   AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
@@ -142,16 +272,8 @@ public:
     return getTM<AMDGPUTargetMachine>();
   }
 
-  ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const override {
-    const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      return createR600MachineScheduler(C);
-    else if (ST.enableSIScheduler())
-      return createSIMachineScheduler(C);
-    return nullptr;
-  }
-
+  void addEarlyCSEOrGVNPass();
+  void addStraightLineScalarOptimizationPasses();
   void addIRPasses() override;
   void addCodeGenPrepare() override;
   bool addPreISel() override;
@@ -159,27 +281,44 @@ public:
   bool addGCPasses() override;
 };
 
-class R600PassConfig : public AMDGPUPassConfig {
+class R600PassConfig final : public AMDGPUPassConfig {
 public:
   R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) { }
 
+  ScheduleDAGInstrs *createMachineScheduler(
+    MachineSchedContext *C) const override {
+    return createR600MachineScheduler(C);
+  }
+
   bool addPreISel() override;
   void addPreRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
 };
 
-class GCNPassConfig : public AMDGPUPassConfig {
+class GCNPassConfig final : public AMDGPUPassConfig {
 public:
   GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) { }
+
+  GCNTargetMachine &getGCNTargetMachine() const {
+    return getTM<GCNTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override;
+
   bool addPreISel() override;
+  void addMachineSSAOptimization() override;
   bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addRegBankSelect() override;
+#endif
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
   void addPreRegAlloc() override;
-  void addPostRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
 };
@@ -188,12 +327,39 @@ public:
 TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
   return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(
-        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
   });
 }
 
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  addPass(createSeparateConstOffsetFromGEPPass());
+  addPass(createSpeculativeExecutionPass());
+  // ReassociateGEPs exposes more opportunities for SLSR. See
+  // the example in reassociate-geps-and-slsr.ll.
+  addPass(createStraightLineStrengthReducePass());
+  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+  // EarlyCSE can reuse.
+  addEarlyCSEOrGVNPass();
+  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+  addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates redundant common expressions, so run
+  // EarlyCSE after it.
+  addPass(createEarlyCSEPass());
+}
+
 void AMDGPUPassConfig::addIRPasses() {
+  // There is no reason to run these.
+  disablePass(&StackMapLivenessID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&PatchableFunctionID);
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerPass());
@@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() {
   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
 
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+  if (TM.getOptLevel() > CodeGenOpt::None) {
+    addPass(createAMDGPUPromoteAlloca(&TM));
+
+    if (EnableSROA)
+      addPass(createSROAPass());
+  }
+
+  addStraightLineScalarOptimizationPasses();
+
   TargetPassConfig::addIRPasses();
+
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  if (getOptLevel() != CodeGenOpt::None)
+    addEarlyCSEOrGVNPass();
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-  if (ST.isPromoteAllocaEnabled()) {
-    addPass(createAMDGPUPromoteAlloca(ST));
-    addPass(createSROAPass());
-  }
   TargetPassConfig::addCodeGenPrepare();
+
+  if (EnableLoadStoreVectorizer)
+    addPass(createLoadStoreVectorizerPass());
 }
 
-bool
-AMDGPUPassConfig::addPreISel() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+bool AMDGPUPassConfig::addPreISel() {
   addPass(createFlattenCFGPass());
-  if (ST.IsIRStructurizerEnabled())
-    addPass(createStructurizeCFGPass());
   return false;
 }
 
@@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() {
 
 bool R600PassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
-  addPass(createR600TextureIntrinsicsReplacer());
+
+  if (EnableR600StructurizeCFG)
+    addPass(createStructurizeCFGPass());
   return false;
 }
 
@@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() {
 }
 
 void R600PassConfig::addPreSched2() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
   addPass(createR600EmitClauseMarkers(), false);
-  if (ST.isIfCvtEnabled())
+  if (EnableR600IfConvert)
     addPass(&IfConverterID, false);
   addPass(createR600ClauseMergePass(*TM), false);
 }
@@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
 // GCN Pass Setup
 //===----------------------------------------------------------------------===//
 
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+  MachineSchedContext *C) const {
+  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+  if (ST.enableSIScheduler())
+    return createSIMachineScheduler(C);
+  return nullptr;
+}
+
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
   addPass(&AMDGPUAnnotateKernelFeaturesID);
-
+  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
-  addPass(createSIAnnotateControlFlowPass());
   addPass(createAMDGPUAnnotateUniformValues());
+  addPass(createSIAnnotateControlFlowPass());
 
   return false;
 }
 
+void GCNPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  // We want to fold operands after PeepholeOptimizer has run (or as part of
+  // it), because it will eliminate extra copies making it easier to fold the
+  // real source operand. We want to eliminate dead instructions after, so that
+  // we see fewer uses of the copies. We then need to clean up the dead
+  // instructions leftover after the operands are folded as well.
+  //
+  // XXX - Can we get away without running DeadMachineInstructionElim again?
+  addPass(&SIFoldOperandsID);
+  addPass(&DeadMachineInstructionElimID);
+}
+
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
-  addPass(createSIFoldOperandsPass());
   return false;
 }
 
-void GCNPassConfig::addPreRegAlloc() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
 
+bool GCNPassConfig::addRegBankSelect() {
+  return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() {
   // This needs to be run directly before register allocation because
   // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
@@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() {
     insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
   }
 
-  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+  if (getOptLevel() > CodeGenOpt::None) {
     // Don't do this with no optimizations since it throws away debug info by
     // merging nonadjacent loads.
 
     // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.
+
+    // FIXME: Move pre-RA and remove extra reg coalescer run.
     insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
-  addPass(createSIShrinkInstructionsPass(), false);
+
+  addPass(createSIShrinkInstructionsPass());
+  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
-  addPass(&SIFixSGPRLiveRangesID);
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
 
 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
-  // We want to run this after LiveVariables is computed to avoid computing them
-  // twice.
-  // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
-  // that needs to be fixed.
-  insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
 
-void GCNPassConfig::addPostRegAlloc() {
-  addPass(createSIShrinkInstructionsPass(), false);
-}
-
 void GCNPassConfig::addPreSched2() {
 }
 
 void GCNPassConfig::addPreEmitPass() {
-  addPass(createSIInsertWaits(*TM), false);
-  addPass(createSILowerControlFlowPass(*TM), false);
+  // The hazard recognizer that runs as part of the post-ra scheduler does not
+  // guarantee to be able to handle all hazards correctly. This is because if
+  // there are multiple scheduling regions in a basic block, the regions are
+  // scheduled bottom up, so when we begin to schedule a region we don't know
+  // what instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  addPass(&PostRAHazardRecognizerID);
+
+  addPass(createSIInsertWaitsPass());
+  addPass(createSIShrinkInstructionsPass());
+  addPass(createSILowerControlFlowPass());
+  addPass(createSIDebuggerInsertNopsPass());
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 236e3f824030..b0eb3a9a15f7 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -12,15 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
-#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
 
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "llvm/IR/DataLayout.h"
 
 namespace llvm {
 
@@ -29,23 +25,23 @@ namespace llvm {
 //===----------------------------------------------------------------------===//
 
 class AMDGPUTargetMachine : public LLVMTargetMachine {
-private:
-
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  AMDGPUSubtarget Subtarget;
   AMDGPUIntrinsicInfo IntrinsicInfo;
 
+  StringRef getGPUName(const Function &F) const;
+  StringRef getFeatureString(const Function &F) const;
+
 public:
-  AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS,
-                      StringRef CPU, TargetOptions Options, Reloc::Model RM,
-                      CodeModel::Model CM, CodeGenOpt::Level OL);
+  AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                      StringRef FS, TargetOptions Options,
+                      Optional<Reloc::Model> RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
   ~AMDGPUTargetMachine();
 
-  const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
-  const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override {
-    return &Subtarget;
-  }
+  const AMDGPUSubtarget *getSubtargetImpl() const;
+  const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override;
+
   const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
@@ -60,30 +56,47 @@ public:
 // R600 Target Machine (R600 -> Cayman)
 //===----------------------------------------------------------------------===//
 
-class R600TargetMachine : public AMDGPUTargetMachine {
+class R600TargetMachine final : public AMDGPUTargetMachine {
+private:
+  mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
 
 public:
-  R600TargetMachine(const Target &T, const Triple &TT, StringRef FS,
-                    StringRef CPU, TargetOptions Options, Reloc::Model RM,
-                    CodeModel::Model CM, CodeGenOpt::Level OL);
+  R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, TargetOptions Options,
+                    Optional<Reloc::Model> RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL);
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  const R600Subtarget *getSubtargetImpl(const Function &) const override;
 };
 
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//
 
-class GCNTargetMachine : public AMDGPUTargetMachine {
+class GCNTargetMachine final : public AMDGPUTargetMachine {
+private:
+  mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
 
 public:
  GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                   StringRef FS, TargetOptions Options,
                   Optional<Reloc::Model> RM, CodeModel::Model CM,
                   CodeGenOpt::Level OL);
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  const SISubtarget *getSubtargetImpl(const Function &) const override;
 };
 
+inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
+  const Function &F) const {
+  if (getTargetTriple().getArch() == Triple::amdgcn)
+    return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F);
+  return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F);
+}
+
 } // End namespace llvm
 
 #endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e050f21091ba..03d1e2c764de 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,59 +29,3 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
 }
-
-//===----------------------------------------------------------------------===//
-// HSA Object File
-//===----------------------------------------------------------------------===//
-
-
-void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
-                                           const TargetMachine &TM){
-  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
-  InitializeELF(TM.Options.UseInitArray);
-
-  TextSection = AMDGPU::getHSATextSection(Ctx);
-
-  DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
-  DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
-
-  RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
-  const char *SectionName) const {
-  return cast<MCSectionELF>(DataGlobalAgentSection)
-    ->getSectionName()
-    .equals(SectionName);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
-  // Read-only segments can only have agent allocation.
-  return AMDGPU::isReadOnlySegment(GV) ||
-         (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
-          isAgentAllocationSection(GV->getSection()));
-}
-
-bool AMDGPUHSATargetObjectFile::isProgramAllocation(
-  const GlobalValue *GV) const {
-  // The default for global segments is program allocation.
- return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); -} - -MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( - const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const { - if (Kind.isText() && !GV->hasComdat()) - return getTextSection(); - - if (AMDGPU::isGlobalSegment(GV)) { - if (isAgentAllocation(GV)) - return DataGlobalAgentSection; - - if (isProgramAllocation(GV)) - return DataGlobalProgramSection; - } - - return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); -} diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index 921341ebb897..f530e0952a74 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -28,24 +28,6 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { const TargetMachine &TM) const override; }; -class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { -private: - MCSection *DataGlobalAgentSection; - MCSection *DataGlobalProgramSection; - MCSection *RodataReadonlyAgentSection; - - bool isAgentAllocationSection(const char *SectionName) const; - bool isAgentAllocation(const GlobalValue *GV) const; - bool isProgramAllocation(const GlobalValue *GV) const; - -public: - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override; -}; - } // end namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 54a003d6a9cf..3d630fe3ea9d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -28,6 +29,7 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" + void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. @@ -78,11 +80,127 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { return Vector ? 0 : 32; } +unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) { + switch (AddrSpace) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + return 128; + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: + return 64; + case AMDGPUAS::PRIVATE_ADDRESS: + return 8 * ST->getMaxPrivateElementSize(); + default: + if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && + (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || + AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || + (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && + AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); + } +} + unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. 
  return 64;
 }
 
+int AMDGPUTTIImpl::getArithmeticInstrCost(
+  unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+  TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+  TTI::OperandValueProperties Opd2PropInfo) {
+
+  EVT OrigTy = TLI->getValueType(DL, Ty);
+  if (!OrigTy.isSimple()) {
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
+  }
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  // Because we don't have any legal vector operations, only the legal types,
+  // we need to account for split vectors.
+  unsigned NElts = LT.second.isVector() ?
+    LT.second.getVectorNumElements() : 1;
+
+  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+  switch (ISD) {
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA: {
+    if (SLT == MVT::i64)
+      return get64BitInstrCost() * LT.first * NElts;
+
+    // i32
+    return getFullRateInstrCost() * LT.first * NElts;
+  }
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    if (SLT == MVT::i64) {
+      // and, or and xor are typically split into 2 VALU instructions.
+      return 2 * getFullRateInstrCost() * LT.first * NElts;
+    }
+
+    return LT.first * NElts * getFullRateInstrCost();
+  }
+  case ISD::MUL: {
+    const int QuarterRateCost = getQuarterRateInstrCost();
+    if (SLT == MVT::i64) {
+      const int FullRateCost = getFullRateInstrCost();
+      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
+    }
+
+    // i32
+    return QuarterRateCost * NElts * LT.first;
+  }
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+    if (SLT == MVT::f64)
+      return LT.first * NElts * get64BitInstrCost();
+
+    if (SLT == MVT::f32 || SLT == MVT::f16)
+      return LT.first * NElts * getFullRateInstrCost();
+    break;
+
+  case ISD::FDIV:
+  case ISD::FREM:
+    // FIXME: frem should be handled separately. The fdiv in it is most of it,
+    // but the current lowering is also not entirely correct.
+    if (SLT == MVT::f64) {
+      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+
+      // Add cost of workaround.
+      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        Cost += 3 * getFullRateInstrCost();
+
+      return LT.first * Cost * NElts;
+    }
+
+    // Assuming no fp32 denormals lowering.
+    if (SLT == MVT::f32 || SLT == MVT::f16) {
+      assert(!ST->hasFP32Denormals() && "will change when supported");
+      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+      return LT.first * NElts * Cost;
+    }
+
+    break;
+  default:
+    break;
+  }
+
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
+}
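Making the rate constants concrete (TCC_Basic is 1): by this function's accounting a scalar i64 mul costs 4 * 3 + (2 * 2) * 1 = 16, a scalar i32 mul is a single quarter-rate operation costing 3, and a <2 x i64> mul doubles the i64 figure to 32 once legalization splits the vector. The rate helpers themselves are defined in the AMDGPUTargetTransformInfo.h hunk further below.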
2 : 0; default: @@ -115,6 +238,9 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, // IntrinsicsAMDGPU.td break; + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_mbcnt_hi: @@ -122,6 +248,31 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::r600_read_tidig_x: case Intrinsic::r600_read_tidig_y: case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_image_atomic_swap: + case Intrinsic::amdgcn_image_atomic_add: + case Intrinsic::amdgcn_image_atomic_sub: + case Intrinsic::amdgcn_image_atomic_smin: + case Intrinsic::amdgcn_image_atomic_umin: + case Intrinsic::amdgcn_image_atomic_smax: + case Intrinsic::amdgcn_image_atomic_umax: + case Intrinsic::amdgcn_image_atomic_and: + case Intrinsic::amdgcn_image_atomic_or: + case Intrinsic::amdgcn_image_atomic_xor: + case Intrinsic::amdgcn_image_atomic_inc: + case Intrinsic::amdgcn_image_atomic_dec: + case Intrinsic::amdgcn_image_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_ps_live: return true; } @@ -129,18 +280,17 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { default: return false; - case AMDGPUIntrinsic::SI_tid: case AMDGPUIntrinsic::SI_fs_interp: + case AMDGPUIntrinsic::SI_fs_constant: return true; } } static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); - unsigned ShaderType = AMDGPU::getShaderType(*F); // Arguments to compute shaders are never a source of divergence. - if (ShaderType == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) return true; // For non-compute shaders, SGPR inputs are marked with either inreg or byval. @@ -169,6 +319,13 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { if (const LoadInst *Load = dyn_cast<LoadInst>(V)) return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + // Atomics are divergent because they are executed sequentially: when an + // atomic operation refers to the same address in each thread, then each + // thread after the first sees the value written by the previous thread as + // its original value.
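The comment above is the whole argument for the check that follows: even when every lane issues the identical atomic to the identical address, the returned "original value" differs per lane, so the result must be treated as divergent. A host-side sketch of that effect (std::atomic standing in for the GPU atomic, the loop standing in for lanes of a wavefront; illustrative only, not part of the patch):

```cpp
#include <atomic>
#include <cstdio>

int main() {
  std::atomic<int> Mem{0};
  for (int Lane = 0; Lane < 4; ++Lane) {
    // Same operation and operands in every "lane"...
    int Old = Mem.fetch_add(1);
    // ...but the serialized updates make the result lane-dependent: 0,1,2,3.
    std::printf("lane %d saw old value %d\n", Lane, Old);
  }
  return 0;
}
```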
+ if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V)) + return true; + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { const TargetMachine &TM = getTLI()->getTargetMachine(); return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 976afb03443b..a82a07458086 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -14,18 +14,18 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" namespace llvm { +class AMDGPUTargetLowering; -class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { +class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; typedef TargetTransformInfo TTI; friend BaseT; @@ -36,10 +36,33 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { const AMDGPUSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } + + static inline int getFullRateInstrCost() { + return TargetTransformInfo::TCC_Basic; + } + + static inline int getHalfRateInstrCost() { + return 2 * TargetTransformInfo::TCC_Basic; + } + + // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe + // should be 2 or 4. + static inline int getQuarterRateInstrCost() { + return 3 * TargetTransformInfo::TCC_Basic; + } + + // On some parts, normal fp64 operations are half rate, and others + // quarter. This also applies to some integer operations. + inline int get64BitInstrCost() const { + return ST->hasHalfRate64Ops() ? + getHalfRateInstrCost() : getQuarterRateInstrCost(); + } + public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL) - : BaseT(TM, DL), ST(TM->getSubtargetImpl()), - TLI(ST->getTargetLowering()) {} + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) @@ -54,17 +77,27 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ?
TTI::PSK_FastHardware : TTI::PSK_Software; + return TTI::PSK_FastHardware; } unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace); unsigned getMaxInterleaveFactor(unsigned VF); + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + unsigned getCFInstrCost(unsigned Opcode); int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; + + unsigned getVectorSplitCost() { return 0; } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 917efd149e00..21de76396b16 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -50,8 +50,6 @@ STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " "matched"); STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " "matched"); -STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " - "pattern matched"); STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); @@ -162,7 +160,7 @@ public: bool prepare(); bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<R600Subtarget>().getInstrInfo(); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); @@ -213,7 +211,6 @@ protected: int getSCCNum(MachineBasicBlock *MBB) const; MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; bool hasBackEdge(MachineBasicBlock *MBB) const; - static unsigned getLoopDepth(MachineLoop *LoopRep); bool isRetiredBlock(MachineBasicBlock *MBB) const; bool isActiveLoophead(MachineBasicBlock *MBB) const; PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, @@ -229,16 +226,15 @@ protected: // Function originally from CFGStructTraits void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); + const DebugLoc &DL = DebugLoc()); MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); + const DebugLoc &DL = DebugLoc()); MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - DebugLoc DL); + const DebugLoc &DL); void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL); - void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL); static int getBranchNzeroOpcode(int OldOpcode); static int getBranchZeroOpcode(int OldOpcode); static int getContinueNzeroOpcode(int OldOpcode); @@ -257,7 +253,6 @@ protected: /// instruction. Such move instruction "belong to" the loop backward-edge.
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); - static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); static bool isReturnBlock(MachineBasicBlock *MBB); static void cloneSuccessorList(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) ; @@ -276,11 +271,7 @@ protected: int ifPatternMatch(MachineBasicBlock *MBB); int loopendPatternMatch(); int mergeLoop(MachineLoop *LoopRep); - int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); - void handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop); /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in /// the same loop with LoopLandInfo without explicitly keeping track of /// loopContBlks and loopBreakBlks, this is a method to get the information. @@ -337,13 +328,7 @@ protected: MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); - MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *> &); - /// This is work around solution for findNearestCommonDominator not available - /// to post dom a proper fix should go to Dominators.h. - MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2); private: MBBInfoMap BlockInfoMap; @@ -376,10 +361,6 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { return MBB->isSuccessor(LoopHeader); } -unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { - return LoopRep ? LoopRep->getLoopDepth() : 0; -} - bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) @@ -442,7 +423,8 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { void AMDGPUCFGStructurizer::reversePredicateSetter( MachineBasicBlock::iterator I) { - while (I--) { + assert(static_cast<MachineInstr *>(I) && "Expected valid iterator"); + for (;; --I) { if (I->getOpcode() == AMDGPU::PRED_X) { switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { case OPCODE_IS_ZERO_INT: @@ -469,16 +451,17 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( } void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = MBB->getParent() - ->CreateMachineInstr(TII->get(NewOpcode), DL); + int NewOpcode, const DebugLoc &DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); MBB->push_back(MI); //assume the instruction doesn't take any reg operand ...
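These two helpers differ only in where the freshly created instruction lands in the block. A toy analogue of that placement (a std::list of opcode ints standing in for a MachineBasicBlock; illustrative only, not the LLVM API):

```cpp
#include <cassert>
#include <list>

// Mirrors insertInstrEnd: create, then append at the block's end.
void insertAtEnd(std::list<int> &MBB, int NewOpcode) {
  MBB.push_back(NewOpcode);
}

// Mirrors the MBB-taking insertInstrBefore: the new instruction
// becomes the first one in the block.
void insertAtFront(std::list<int> &MBB, int NewOpcode) {
  MBB.insert(MBB.begin(), NewOpcode);
}

int main() {
  std::list<int> MBB{10, 20};
  insertAtEnd(MBB, 30);  // {10, 20, 30}
  insertAtFront(MBB, 0); // {0, 10, 20, 30}
  assert(MBB.front() == 0 && MBB.back() == 30);
  return 0;
}
```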
SHOWNEWINSTR(MI); } MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { + int NewOpcode, + const DebugLoc &DL) { MachineInstr *MI = MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); if (MBB->begin() != MBB->end()) @@ -502,7 +485,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( } void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); MachineFunction *MF = MBB->getParent(); @@ -514,9 +497,9 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( //erase later oldInstr->eraseFromParent(); } -void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL) { +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL) { MachineFunction *MF = blk->getParent(); MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); //insert before @@ -525,16 +508,6 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, SHOWNEWINSTR(NewInstr); } -void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, - int NewOpcode, int RegNum) { - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewInstr = - MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->push_back(NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { case AMDGPU::JUMP_COND: @@ -664,16 +637,6 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *MI = &(*It); - if (MI->getOpcode() == AMDGPU::CONTINUE) - return MI; - } - return nullptr; -} - bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { MachineInstr *MI = getReturnInstr(MBB); bool IsReturn = (MBB->succ_size() == 0); @@ -697,11 +660,8 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { MachineFunction *Func = MBB->getParent(); MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); Func->push_back(NewMBB); //insert to function - for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); - It != E; ++It) { - MachineInstr *MI = Func->CloneMachineInstr(It); - NewMBB->push_back(MI); - } + for (const MachineInstr &It : *MBB) + NewMBB->push_back(Func->CloneMachineInstr(&It)); return NewMBB; } @@ -727,7 +687,7 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { while (It != E) { if (Pre->getOpcode() == AMDGPU::CONTINUE && It->getOpcode() == AMDGPU::ENDLOOP) - ContInstr.push_back(Pre); + ContInstr.push_back(&*Pre); Pre = It; ++It; } @@ -923,7 +883,7 @@ bool AMDGPUCFGStructurizer::run() { if (!Finish) { DEBUG(FuncRep->viewCFG()); - llvm_unreachable("IRREDUCIBLE_CFG"); + report_fatal_error("IRREDUCIBLE_CFG"); } return true; @@ -1145,34 +1105,6 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { return 1; } -int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, - MachineBasicBlock *LoopHeader) { - int NumCont = 0; - SmallVector 
ContMBB; - typedef GraphTraits > GTIM; - GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), - E = GTIM::child_end(LoopHeader); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; - if (LoopRep->contains(MBB)) { - handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), - LoopHeader, LoopRep); - ContMBB.push_back(MBB); - ++NumCont; - } - } - - for (SmallVectorImpl::iterator It = ContMBB.begin(), - E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader, true); - } - - numLoopcontPatternMatch += NumCont; - - return NumCont; -} - - bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { if (Src1MBB->succ_size() == 0) { @@ -1413,10 +1345,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); if (LandBlkHasOtherPred) { - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); unsigned CmpResReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra compare instruction needed to handle CFG"); + report_fatal_error("Extra compare instruction needed to handle CFG"); insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, CmpResReg, DebugLoc()); } @@ -1433,7 +1365,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // need to uncondionally insert the assignment to ensure a path from its // predecessor rather than headBlk has valid value in initReg if // (initVal != 1). - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } insertInstrBefore(I, AMDGPU::ELSE); @@ -1442,7 +1374,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // need to uncondionally insert the assignment to ensure a path from its // predecessor rather than headBlk has valid value in initReg if // (initVal != 0) - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } if (LandBlkHasOtherPred) { @@ -1454,7 +1386,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, PE = LandBlk->pred_end(); PI != PE; ++PI) { MachineBasicBlock *MBB = *PI; if (MBB != TrueMBB && MBB != FalseMBB) - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } } DEBUG( @@ -1468,17 +1400,6 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, return NumNewBlk; } -void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop) { - DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() - << " header = BB" << ContMBB->getNumber() << "\n"; - dbgs() << "Trying to continue loop-depth = " - << getLoopDepth(ContLoop) - << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); - settleLoopcontBlock(ContingMBB, ContMBB); -} - void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { DEBUG( @@ -1809,76 +1730,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { && "can't retire block yet"); } -void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, - MachineBasicBlock *MBB) { - MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; - if (!MBB) { - MBB = FuncRep->CreateMachineBasicBlock(); - 
FuncRep->push_back(MBB); //insert to function - SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); - } - TheEntry = MBB; - DEBUG( - dbgs() << "setLoopLandBlock loop-header = BB" - << loopRep->getHeader()->getNumber() - << " landing-block = BB" << MBB->getNumber() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2) { - - if (PDT->dominates(MBB1, MBB2)) - return MBB1; - if (PDT->dominates(MBB2, MBB1)) - return MBB2; - - MachineDomTreeNode *Node1 = PDT->getNode(MBB1); - MachineDomTreeNode *Node2 = PDT->getNode(MBB2); - - // Handle newly cloned node. - if (!Node1 && MBB1->succ_size() == 1) - return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); - if (!Node2 && MBB2->succ_size() == 1) - return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); - - if (!Node1 || !Node2) - return nullptr; - - Node1 = Node1->getIDom(); - while (Node1) { - if (PDT->dominates(Node1, Node2)) - return Node1->getBlock(); - Node1 = Node1->getIDom(); - } - - return nullptr; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom( - std::set &MBBs) { - MachineBasicBlock *CommonDom; - std::set::const_iterator It = MBBs.begin(); - std::set::const_iterator E = MBBs.end(); - for (CommonDom = *It; It != E && CommonDom; ++It) { - MachineBasicBlock *MBB = *It; - if (MBB != CommonDom) - CommonDom = findNearestCommonPostDom(MBB, CommonDom); - } - - DEBUG( - dbgs() << "Common post dominator for exit blocks is "; - if (CommonDom) - dbgs() << "BB" << CommonDom->getNumber() << "\n"; - else - dbgs() << "NULL\n"; - ); - - return CommonDom; -} - char AMDGPUCFGStructurizer::ID = 0; } // end anonymous namespace diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index a9ba60c8cbad..5d243e949fd3 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -44,6 +44,15 @@ enum amd_code_version_t { AMD_CODE_VERSION_MINOR = 1 }; +// Sets val bits for specified mask in specified dst packed instance. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask ## _SHIFT) & ~mask); \ + dst |= (((val) << mask ## _SHIFT) & mask) + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + /// The values used to define the number of bytes to use for the /// swizzle element size. enum amd_element_byte_size_t { @@ -118,10 +127,14 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, + AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, + AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, + /// Control wave ID base counter for GDS ordered-append. Used to set /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if /// ORDERED_APPEND_MODE also needs to be settable) - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, @@ -146,7 +159,7 @@ enum amd_code_property_mask_t { /// is generally DWORD. /// /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, @@ -155,7 +168,7 @@ enum amd_code_property_mask_t { /// HSA_MACHINE_LARGE. Must also match /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19, AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, @@ -167,18 +180,22 @@ enum amd_code_property_mask_t { /// workitem_private_segment_byte_size only specifies the statically /// know private segment size, and additional space must be added /// for the call stack. - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, /// Indicate if code generated has support for debugging. 
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, + + AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23, + AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, + AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT }; /// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d9f753f40133..efcf1b23adaa 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// +//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +7,17 @@ // //===----------------------------------------------------------------------===// +#include "AMDKernelCodeT.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "AMDKernelCodeT.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDKernelCodeTUtils.h" +#include "Utils/AMDGPUAsmUtils.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" @@ -25,16 +27,17 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -42,6 +45,8 @@ namespace { struct OptionalOperand; +enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL }; + class AMDGPUOperand : public MCParsedAsmOperand { enum KindTy { Token, @@ -55,19 +60,74 @@ class AMDGPUOperand : public MCParsedAsmOperand { public: AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} - MCContext *Ctx; + typedef std::unique_ptr Ptr; + + struct Modifiers { + bool Abs; + bool Neg; + bool Sext; + + bool hasFPModifiers() const { return Abs || Neg; } + bool hasIntModifiers() const { return Sext; } + bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); } + + int64_t getFPModifiersOperand() const { + int64_t 
Operand = 0; + Operand |= Abs ? SISrcMods::ABS : 0; + Operand |= Neg ? SISrcMods::NEG : 0; + return Operand; + } + + int64_t getIntModifiersOperand() const { + int64_t Operand = 0; + Operand |= Sext ? SISrcMods::SEXT : 0; + return Operand; + } + + int64_t getModifiersOperand() const { + assert(!(hasFPModifiers() && hasIntModifiers()) + && "fp and int modifiers should not be used simultaneously"); + if (hasFPModifiers()) { + return getFPModifiersOperand(); + } else if (hasIntModifiers()) { + return getIntModifiersOperand(); + } else { + return 0; + } + } + + friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods); + }; enum ImmTy { ImmTyNone, - ImmTyDSOffset0, - ImmTyDSOffset1, ImmTyGDS, + ImmTyOffen, + ImmTyIdxen, + ImmTyAddr64, ImmTyOffset, + ImmTyOffset0, + ImmTyOffset1, ImmTyGLC, ImmTySLC, ImmTyTFE, - ImmTyClamp, - ImmTyOMod + ImmTyClampSI, + ImmTyOModSI, + ImmTyDppCtrl, + ImmTyDppRowMask, + ImmTyDppBankMask, + ImmTyDppBoundCtrl, + ImmTySdwaDstSel, + ImmTySdwaSrc0Sel, + ImmTySdwaSrc1Sel, + ImmTySdwaDstUnused, + ImmTyDMask, + ImmTyUNorm, + ImmTyDA, + ImmTyR128, + ImmTyLWE, + ImmTyHwreg, + ImmTySendMsg, }; struct TokOp { @@ -79,11 +139,12 @@ public: bool IsFPImm; ImmTy Type; int64_t Val; + Modifiers Mods; }; struct RegOp { unsigned RegNo; - int Modifiers; + Modifiers Mods; const MCRegisterInfo *TRI; const MCSubtargetInfo *STI; bool IsForcedVOP3; @@ -96,175 +157,323 @@ public: const MCExpr *Expr; }; - void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm(getImm())); + bool isToken() const override { + if (Kind == Token) + return true; + + if (Kind != Expression || !Expr) + return false; + + // When parsing operands, we can't always tell if something was meant to be + // a token, like 'gds', or an expression that references a global variable. + // In this case, we assume the string is an expression, and if we need to + // interpret is a token, then we treat the symbol name as the token. + return isa(Expr); } - StringRef getToken() const { - return StringRef(Tok.Data, Tok.Length); + bool isImm() const override { + return Kind == Immediate; } - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); + bool isInlinableImm() const { + if (!isImmTy(ImmTyNone)) { + // Only plain immediates are inlinable (e.g. "clamp" attribute is not) + return false; + } + // TODO: We should avoid using host float here. It would be better to + // check the float bit values which is what a few other places do. + // We've had bot failures before due to weird NaN support on mips hosts. + const float F = BitsToFloat(Imm.Val); + // TODO: Add 1/(2*pi) for VI + return (Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0); } - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { - if (isReg()) - addRegOperands(Inst, N); - else - addImmOperands(Inst, N); + bool isRegKind() const { + return Kind == Register; } - void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm( - Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); - addRegOperands(Inst, N); + bool isReg() const override { + return isRegKind() && !Reg.Mods.hasModifiers(); + } + + bool isRegOrImmWithInputMods() const { + return isRegKind() || isInlinableImm(); + } + + bool isImmTy(ImmTy ImmT) const { + return isImm() && Imm.Type == ImmT; + } + + bool isImmModifier() const { + return isImm() && Imm.Type != ImmTyNone; + } + + bool isClampSI() const { return isImmTy(ImmTyClampSI); } + bool isOModSI() const { return isImmTy(ImmTyOModSI); } + bool isDMask() const { return isImmTy(ImmTyDMask); } + bool isUNorm() const { return isImmTy(ImmTyUNorm); } + bool isDA() const { return isImmTy(ImmTyDA); } + bool isR128() const { return isImmTy(ImmTyUNorm); } + bool isLWE() const { return isImmTy(ImmTyLWE); } + bool isOffen() const { return isImmTy(ImmTyOffen); } + bool isIdxen() const { return isImmTy(ImmTyIdxen); } + bool isAddr64() const { return isImmTy(ImmTyAddr64); } + bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } + bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } + bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } + bool isGDS() const { return isImmTy(ImmTyGDS); } + bool isGLC() const { return isImmTy(ImmTyGLC); } + bool isSLC() const { return isImmTy(ImmTySLC); } + bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } + bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } + bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } + bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); } + bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } + bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } + bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); } + + bool isMod() const { + return isClampSI() || isOModSI(); } - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } + bool isRegOrImm() const { + return isReg() || isImm(); } - bool defaultTokenHasSuffix() const { - StringRef Token(Tok.Data, Tok.Length); + bool isRegClass(unsigned RCID) const { + return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg()); + } - return Token.endswith("_e32") || Token.endswith("_e64"); + bool isSCSrc32() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID); } - bool isToken() const override { - return Kind == Token; + bool isSCSrc64() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID); } - bool isImm() const override { - return Kind == Immediate; + bool isSSrc32() const { + return isImm() || isSCSrc32() || isExpr(); } - bool isInlineImm() const { - float F = BitsToFloat(Imm.Val); - // TODO: Add 0.5pi for VI - return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + bool isSSrc64() const { + // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. + // See isVSrc64(). 
+ return isImm() || isSCSrc64(); } - bool isDSOffset0() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset0; + bool isVCSrc32() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID); } - bool isDSOffset1() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset1; + bool isVCSrc64() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID); } - int64_t getImm() const { - return Imm.Val; + bool isVSrc32() const { + return isImm() || isVCSrc32(); } - enum ImmTy getImmTy() const { - assert(isImm()); - return Imm.Type; + bool isVSrc64() const { + // TODO: Check if the 64-bit value (coming from assembly source) can be + // narrowed to 32 bits (in the instruction stream). That require knowledge + // of instruction type (unsigned/signed, floating or "untyped"/B64), + // see [AMD GCN3 ISA 6.3.1]. + // TODO: How 64-bit values are formed from 32-bit literals in _B64 insns? + return isImm() || isVCSrc64(); } - bool isRegKind() const { - return Kind == Register; + bool isMem() const override { + return false; } - bool isReg() const override { - return Kind == Register && Reg.Modifiers == -1; + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); } - bool isRegWithInputMods() const { - return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + bool isSWaitCnt() const; + bool isHwreg() const; + bool isSendMsg() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; + bool isDPPCtrl() const; + + StringRef getExpressionAsToken() const { + assert(isExpr()); + const MCSymbolRefExpr *S = cast(Expr); + return S->getSymbol().getName(); } - void setModifiers(unsigned Mods) { - assert(isReg()); - Reg.Modifiers = Mods; + + StringRef getToken() const { + assert(isToken()); + + if (Kind == Expression) + return getExpressionAsToken(); + + return StringRef(Tok.Data, Tok.Length); } - bool hasModifiers() const { - assert(isRegKind()); - return Reg.Modifiers != -1; + int64_t getImm() const { + assert(isImm()); + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; } unsigned getReg() const override { return Reg.RegNo; } - bool isRegOrImm() const { - return isReg() || isImm(); + SMLoc getStartLoc() const override { + return StartLoc; } - bool isRegClass(unsigned RCID) const { - return Reg.TRI->getRegClass(RCID).contains(getReg()); + SMLoc getEndLoc() const override { + return EndLoc; } - bool isSCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + Modifiers getModifiers() const { + assert(isRegKind() || isImmTy(ImmTyNone)); + return isRegKind() ? 
Reg.Mods : Imm.Mods; } - bool isSSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + void setModifiers(Modifiers Mods) { + assert(isRegKind() || isImmTy(ImmTyNone)); + if (isRegKind()) + Reg.Mods = Mods; + else + Imm.Mods = Mods; } - bool isSSrc64() const { - return isImm() || isInlineImm() || - (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + bool hasModifiers() const { + return getModifiers().hasModifiers(); } - - bool isSCSrc64() const { - return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + + bool hasFPModifiers() const { + return getModifiers().hasFPModifiers(); } - bool isVCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + bool hasIntModifiers() const { + return getModifiers().hasIntModifiers(); } - bool isVCSrc64() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const { + if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) { + // Apply modifiers to immediate value + int64_t Val = Imm.Val; + bool Negate = Imm.Mods.Neg; // Only negate can get here + if (Imm.IsFPImm) { + APFloat F(BitsToFloat(Val)); + if (Negate) { + F.changeSign(); + } + Val = F.bitcastToAPInt().getZExtValue(); + } else { + Val = Negate ? -Val : Val; + } + Inst.addOperand(MCOperand::createImm(Val)); + } else { + Inst.addOperand(MCOperand::createImm(getImm())); + } } - bool isVSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } - bool isVSrc64() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isRegKind()) + addRegOperands(Inst, N); + else if (isExpr()) + Inst.addOperand(MCOperand::createExpr(Expr)); + else + addImmOperands(Inst, N); } - bool isMem() const override { - return false; + void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + if (isRegKind()) { + addRegOperands(Inst, N); + } else { + addImmOperands(Inst, N, false); + } } - bool isExpr() const { - return Kind == Expression; + void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegOrImmWithInputModsOperands(Inst, N); } - bool isSoppBrTarget() const { - return isExpr() || isImm(); + void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegOrImmWithInputModsOperands(Inst, N); } - SMLoc getStartLoc() const override { - return StartLoc; + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } } - SMLoc getEndLoc() const override { - return EndLoc; + void printImmTy(raw_ostream& OS, ImmTy Type) const { + switch (Type) { + case ImmTyNone: OS << "None"; break; + case ImmTyGDS: OS << "GDS"; break; + case ImmTyOffen: OS << "Offen"; break; + case ImmTyIdxen: OS << "Idxen"; break; + case ImmTyAddr64: OS << "Addr64"; break; + case ImmTyOffset: OS << "Offset"; break; + case ImmTyOffset0: OS << "Offset0"; break; + case ImmTyOffset1: OS << "Offset1"; break; + case ImmTyGLC: OS << "GLC"; break; + case ImmTySLC: OS 
<< "SLC"; break; + case ImmTyTFE: OS << "TFE"; break; + case ImmTyClampSI: OS << "ClampSI"; break; + case ImmTyOModSI: OS << "OModSI"; break; + case ImmTyDppCtrl: OS << "DppCtrl"; break; + case ImmTyDppRowMask: OS << "DppRowMask"; break; + case ImmTyDppBankMask: OS << "DppBankMask"; break; + case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break; + case ImmTySdwaDstSel: OS << "SdwaDstSel"; break; + case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break; + case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break; + case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break; + case ImmTyDMask: OS << "DMask"; break; + case ImmTyUNorm: OS << "UNorm"; break; + case ImmTyDA: OS << "DA"; break; + case ImmTyR128: OS << "R128"; break; + case ImmTyLWE: OS << "LWE"; break; + case ImmTyHwreg: OS << "Hwreg"; break; + case ImmTySendMsg: OS << "SendMsg"; break; + } } void print(raw_ostream &OS) const override { switch (Kind) { case Register: - OS << "'; + OS << "'; break; case Immediate: - OS << getImm(); + OS << '<' << getImm(); + if (getImmTy() != ImmTyNone) { + OS << " type: "; printImmTy(OS, getImmTy()); + } + OS << " mods: " << Imm.Mods << '>'; break; case Token: OS << '\'' << getToken() << '\''; @@ -275,20 +484,21 @@ public: } } - static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, - bool IsFPImm = false) { + static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { auto Op = llvm::make_unique(Immediate); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; + Op->Imm.Mods = {false, false, false}; Op->StartLoc = Loc; Op->EndLoc = Loc; return Op; } - static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, - bool HasExplicitEncodingSize = true) { + static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { auto Res = llvm::make_unique(Token); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); @@ -297,43 +507,43 @@ public: return Res; } - static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, - SMLoc E, - const MCRegisterInfo *TRI, - const MCSubtargetInfo *STI, - bool ForceVOP3) { + static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, + bool ForceVOP3) { auto Op = llvm::make_unique(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; Op->Reg.STI = STI; - Op->Reg.Modifiers = -1; + Op->Reg.Mods = {false, false, false}; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { + static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { auto Op = llvm::make_unique(Expression); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; return Op; } - - bool isDSOffset() const; - bool isDSOffset01() const; - bool isSWaitCnt() const; - bool isMubufOffset() const; - bool isSMRDOffset() const; - bool isSMRDLiteralOffset() const; }; +raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { + OS << "abs:" << Mods.Abs << " neg: " << Mods.Neg << " sext:" << Mods.Sext; + return OS; +} + class AMDGPUAsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + bool ForcedDPP; + bool ForcedSDWA; bool isSI() const { return AMDGPU::isSI(getSTI()); @@ -373,8 +583,10 @@ private: bool ParseSectionDirectiveHSADataGlobalAgent(); bool ParseSectionDirectiveHSADataGlobalProgram(); bool ParseSectionDirectiveHSARodataReadonlyAgent(); 
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); + bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth); + void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn); -public: public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY @@ -384,7 +596,9 @@ public: const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0) { + ForcedEncodingSize(0), + ForcedDPP(false), + ForcedSDWA(false) { MCAsmParserExtension::Initialize(Parser); if (getSTI().getFeatureBits().none()) { @@ -393,6 +607,21 @@ public: } setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + + { + // TODO: make those pre-defined variables read-only. + // Currently there is none suitable machinery in the core llvm-mc for this. + // MCSymbol::isRedefinable is intended for another purpose, and + // AsmParser::parseDirectiveSet() cannot be specialized for specific target. + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); + } } AMDGPUTargetStreamer &getTargetStreamer() { @@ -400,84 +629,117 @@ public: return static_cast(TS); } - unsigned getForcedEncodingSize() const { - return ForcedEncodingSize; - } - - void setForcedEncodingSize(unsigned Size) { - ForcedEncodingSize = Size; - } + void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; } + void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; } + void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; } - bool isForcedVOP3() const { - return ForcedEncodingSize == 64; - } + unsigned getForcedEncodingSize() const { return ForcedEncodingSize; } + bool isForcedVOP3() const { return ForcedEncodingSize == 64; } + bool isForcedDPP() const { return ForcedDPP; } + bool isForcedSDWA() const { return ForcedSDWA; } + std::unique_ptr parseRegister(); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool 
(*ConvertResult)(int64_t&) = 0); OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseOptionalOps( - const ArrayRef &OptionalOps, - OperandVector &Operands); + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); + OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); bool parseCnt(int64_t &IntVal); OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + OperandMatchResultTy parseHwreg(OperandVector &Operands); - OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); - void cvtFlat(MCInst &Inst, const OperandVector &Operands); +private: + struct OperandInfoTy { + int64_t Id; + bool IsSymbolic; + OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { } + }; - void cvtMubuf(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseOffset(OperandVector &Operands); - OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseGLC(OperandVector &Operands); - OperandMatchResultTy parseSLC(OperandVector &Operands); - OperandMatchResultTy parseTFE(OperandVector &Operands); + bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); + bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); +public: + OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); - OperandMatchResultTy parseDMask(OperandVector &Operands); - OperandMatchResultTy parseUNorm(OperandVector &Operands); - OperandMatchResultTy parseR128(OperandVector &Operands); + OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } + void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } + void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + AMDGPUOperand::Ptr defaultGLC() const; + AMDGPUOperand::Ptr defaultSLC() const; + AMDGPUOperand::Ptr defaultTFE() const; + + AMDGPUOperand::Ptr defaultDMask() const; + AMDGPUOperand::Ptr defaultUNorm() const; + AMDGPUOperand::Ptr defaultDA() const; + AMDGPUOperand::Ptr defaultR128() const; + AMDGPUOperand::Ptr defaultLWE() const; + AMDGPUOperand::Ptr defaultSMRDOffset() const; + AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; + + OperandMatchResultTy parseOModOperand(OperandVector &Operands); + + void cvtId(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector 
&Operands); - OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); + + void cvtMIMG(MCInst &Inst, const OperandVector &Operands); + void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); + + OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); + AMDGPUOperand::Ptr defaultRowMask() const; + AMDGPUOperand::Ptr defaultBankMask() const; + AMDGPUOperand::Ptr defaultBoundCtrl() const; + void cvtDPP(MCInst &Inst, const OperandVector &Operands); + + OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, + AMDGPUOperand::ImmTy Type); + OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); + void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); + void cvtSDWA(MCInst &Inst, const OperandVector &Operands, + uint64_t BasicInstType); }; struct OptionalOperand { const char *Name; AMDGPUOperand::ImmTy Type; bool IsBit; - int64_t Default; bool (*ConvertResult)(int64_t&); }; } -static int getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { +static int getRegClass(RegisterKind Is, unsigned RegWidth) { + if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; @@ -487,117 +749,389 @@ static int getRegClass(bool IsVgpr, unsigned RegWidth) { case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; } + } else if (Is == IS_TTMP) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::TTMP_32RegClassID; + case 2: return AMDGPU::TTMP_64RegClassID; + case 4: return AMDGPU::TTMP_128RegClassID; + } + } else if (Is == IS_SGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SGPR_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } } - - switch (RegWidth) { - default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } + return -1; } -static unsigned getRegForName(StringRef RegName) { - +static unsigned getSpecialRegForName(StringRef RegName) { return StringSwitch(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) + .Case("tba", AMDGPU::TBA) + .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) .Default(0); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - const AsmToken Tok = Parser.getTok(); - StartLoc = Tok.getLoc(); - EndLoc = Tok.getEndLoc(); - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - - StringRef RegName = Tok.getString(); - RegNo = getRegForName(RegName); + auto R = parseRegister(); + if (!R) return true; + assert(R->isReg()); + RegNo = R->getReg(); + StartLoc = R->getStartLoc(); + EndLoc = R->getEndLoc(); + return false; +} - if (RegNo) { - Parser.Lex(); - return 
!subtargetHasRegister(*TRI, RegNo); +bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum) +{ + switch (RegKind) { + case IS_SPECIAL: + if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; } + if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; } + if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; } + if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; } + if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; } + return false; + case IS_VGPR: + case IS_SGPR: + case IS_TTMP: + if (Reg1 != Reg + RegWidth) { return false; } + RegWidth++; + return true; + default: + assert(false); return false; } +} - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') - return true; +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth) +{ + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (getLexer().is(AsmToken::Identifier)) { + StringRef RegName = Parser.getTok().getString(); + if ((Reg = getSpecialRegForName(RegName))) { + Parser.Lex(); + RegKind = IS_SPECIAL; + } else { + unsigned RegNumIndex = 0; + if (RegName[0] == 'v') { + RegNumIndex = 1; + RegKind = IS_VGPR; + } else if (RegName[0] == 's') { + RegNumIndex = 1; + RegKind = IS_SGPR; + } else if (RegName.startswith("ttmp")) { + RegNumIndex = strlen("ttmp"); + RegKind = IS_TTMP; + } else { + return false; + } + if (RegName.size() > RegNumIndex) { + // Single 32-bit register: vXX. + if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) + return false; + Parser.Lex(); + RegWidth = 1; + } else { + // Range of registers: v[XX:YY]. ":YY" is optional. + Parser.Lex(); + int64_t RegLo, RegHi; + if (getLexer().isNot(AsmToken::LBrac)) + return false; + Parser.Lex(); + + if (getParser().parseAbsoluteExpression(RegLo)) + return false; + + const bool isRBrace = getLexer().is(AsmToken::RBrac); + if (!isRBrace && getLexer().isNot(AsmToken::Colon)) + return false; + Parser.Lex(); + + if (isRBrace) { + RegHi = RegLo; + } else { + if (getParser().parseAbsoluteExpression(RegHi)) + return false; - bool IsVgpr = RegName[0] == 'v'; - unsigned RegWidth; - unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register - RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) - return true; + if (getLexer().isNot(AsmToken::RBrac)) + return false; + Parser.Lex(); + } + RegNum = (unsigned) RegLo; + RegWidth = (RegHi - RegLo) + 1; + } + } + } else if (getLexer().is(AsmToken::LBrac)) { + // List of consecutive registers: [s0,s1,s2,s3] Parser.Lex(); + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + return false; + if (RegWidth != 1) + return false; + RegisterKind RegKind1; + unsigned Reg1, RegNum1, RegWidth1; + do { + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); + } else if (getLexer().is(AsmToken::RBrac)) { + Parser.Lex(); + break; + } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) { + if (RegWidth1 != 1) { + return false; + } + if (RegKind1 != RegKind) { + return false; + } + if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) { + return false; + } + } else { + return false; + } + } while (true); } else { - // We have a register greater than 32-bits. 
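The switch that follows normalizes the parsed register number to an index within the matching register class and enforces the SGPR/TTMP alignment rule. A worked example of that arithmetic for s[4:7] (plain C++, with RegNum and RegWidth as the parser would have them at this point):

```cpp
#include <algorithm>
#include <cassert>

int main() {
  // s[4:7]: first register 4, four registers wide.
  unsigned RegNum = 4, RegWidth = 4;
  unsigned Size = std::min(RegWidth, 4u); // SGPR/TTMP alignment, max 4 dwords
  assert(RegNum % Size == 0);             // 4 % 4 == 0: properly aligned
  RegNum /= Size;                         // index 1 in the SGPR_128 class
  // An unaligned range such as s[5:8] fails here: 5 % 4 != 0.
  return RegNum == 1 ? 0 : 1;
}
```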
+ return false; + } + switch (RegKind) { + case IS_SPECIAL: + RegNum = 0; + RegWidth = 1; + break; + case IS_VGPR: + case IS_SGPR: + case IS_TTMP: + { + unsigned Size = 1; + if (RegKind == IS_SGPR || RegKind == IS_TTMP) { + // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords. + Size = std::min(RegWidth, 4u); + } + if (RegNum % Size != 0) + return false; + RegNum = RegNum / Size; + int RCID = getRegClass(RegKind, RegWidth); + if (RCID == -1) + return false; + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegNum >= RC.getNumRegs()) + return false; + Reg = RC.getRegister(RegNum); + break; + } - int64_t RegLo, RegHi; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return true; + default: + assert(false); return false; + } - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegLo)) - return true; + if (!subtargetHasRegister(*TRI, Reg)) + return false; + return true; +} - if (getLexer().isNot(AsmToken::Colon)) - return true; +std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { + const auto &Tok = Parser.getTok(); + SMLoc StartLoc = Tok.getLoc(); + SMLoc EndLoc = Tok.getEndLoc(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegHi)) - return true; + RegisterKind RegKind; + unsigned Reg, RegNum, RegWidth; - if (getLexer().isNot(AsmToken::RBrac)) - return true; + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { + return nullptr; + } + return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc, + TRI, &getSTI(), false); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseImm(OperandVector &Operands) { + bool Minus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Minus = true; Parser.Lex(); - RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { - // VGPR registers aren't aligned. - RegIndexInClass = RegLo; - } else { - // SGPR registers are aligned. Max alignment is 4 dwords. - unsigned Size = std::min(RegWidth, 4u); - if (RegLo % Size != 0) - return true; + } - RegIndexInClass = RegLo / Size; + SMLoc S = Parser.getTok().getLoc(); + switch(getLexer().getKind()) { + case AsmToken::Integer: { + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { + Error(S, "invalid immediate: only 32-bit values are legal"); + return MatchOperand_ParseFail; + } + + if (Minus) + IntVal *= -1; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); + return MatchOperand_Success; + } + case AsmToken::Real: { + // FIXME: We should emit an error if a double precision floating-point + // value is used. I'm not sure the best way to detect this. + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; - int RCID = getRegClass(IsVgpr, RegWidth); - if (RCID == -1) - return true; + APFloat F((float)BitsToDouble(IntVal)); + if (Minus) + F.changeSign(); + Operands.push_back( + AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S, + AMDGPUOperand::ImmTyNone, true)); + return MatchOperand_Success; + } + default: + return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch; + } +}
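// The alignment rule enforced above reduces to plain arithmetic: an SGPR or
// TTMP range of width W dwords must start at a multiple of min(W, 4), and the
// index within the width-specific register class is the start divided by that
// alignment. A self-contained sketch of the same computation:
#include <algorithm>
#include <cassert>

// Returns -1 for a misaligned range, otherwise the index of the register
// inside its class (mirrors the RegNum % Size and RegNum / Size steps above).
static int sgprClassIndex(unsigned Base, unsigned Width) {
  unsigned Align = std::min(Width, 4u); // max required alignment: 4 dwords
  if (Base % Align != 0)
    return -1;
  return (int)(Base / Align);
}

int main() {
  assert(sgprClassIndex(4, 4) == 1);  // s[4:7]  -> SGPR_128, index 1
  assert(sgprClassIndex(6, 2) == 3);  // s[6:7]  -> SGPR_64,  index 3
  assert(sgprClassIndex(5, 4) == -1); // s[5:8]  -> rejected, misaligned
  assert(sgprClassIndex(8, 8) == 2);  // s[8:15] -> SReg_256, index 2
  return 0;
}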
- AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } - if (auto R = parseRegister()) { + assert(R->isReg()); + R->Reg.IsForcedVOP3 = isForcedVOP3(); + Operands.push_back(std::move(R)); + return MatchOperand_Success; + } + return MatchOperand_ParseFail; } -unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { - - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { + // XXX: During parsing we can't determine if minus sign means + // negate-modifier or negative immediate value. + // By default we suppose it is modifier. + bool Negate = false, Abs = false, Abs2 = false; - if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || - (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + Negate = true; + } + + if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") { + Parser.Lex(); + Abs2 = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after abs"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + } + + if (getLexer().getKind() == AsmToken::Pipe) { + if (Abs2) { + Error(Parser.getTok().getLoc(), "expected register or immediate"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Abs = true; + } + + auto Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) { + return Res; + } + + AMDGPUOperand::Modifiers Mods = {false, false, false}; + if (Negate) { + Mods.Neg = true; + } + if (Abs) { + if (getLexer().getKind() != AsmToken::Pipe) { + Error(Parser.getTok().getLoc(), "expected vertical bar"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Abs = true; + } + if (Abs2) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Abs = true; + } + + if (Mods.hasFPModifiers()) { + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { + bool Sext = false; + + if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { + Parser.Lex(); + Sext = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after sext"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + } + + auto Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) { + return Res; + } + + AMDGPUOperand::Modifiers Mods = {false, false, false}; + if (Sext) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Sext = true; + } + + if (Mods.hasIntModifiers()) { + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); + } + return MatchOperand_Success; +} + +unsigned 
AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + + if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)) || + (isForcedDPP() && !(TSFlags & SIInstrFlags::DPP)) || + (isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)) ) return Match_InvalidOperand; if ((TSFlags & SIInstrFlags::VOP3) && @@ -608,7 +1142,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } - bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -632,31 +1165,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) { - if (isForcedVOP3()) { - // If 64-bit encoding has been forced we can end up with no - // clamp or omod operands if none of the registers have modifiers, - // so we need to add these to the operand list. - AMDGPUOperand &LastOp = - ((AMDGPUOperand &)*Operands[Operands.size() - 1]); - if (LastOp.isRegKind() || - (LastOp.isImm() && - LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { - SMLoc S = Parser.getTok().getLoc(); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyClamp)); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOMod)); - bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, - Out, ErrorInfo, - MatchingInlineAsm); - if (!Res) - return Res; - } - - } return Error(IDLoc, "too few operands for instruction"); } - ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; @@ -762,164 +1272,12 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { - - if (getLexer().isNot(AsmToken::Equal)) - return TokError("expected '='"); - Lex(); - - if (getLexer().isNot(AsmToken::Integer)) - return TokError("amd_kernel_code_t values must be integers"); - - uint64_t Value = getLexer().getTok().getIntVal(); + SmallString<40> ErrStr; + raw_svector_ostream Err(ErrStr); + if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) { + return TokError(Err.str()); + } Lex(); - - if (ID == "kernel_code_version_major") - Header.amd_kernel_code_version_major = Value; - else if (ID == "kernel_code_version_minor") - Header.amd_kernel_code_version_minor = Value; - else if (ID == "machine_kind") - Header.amd_machine_kind = Value; - else if (ID == "machine_version_major") - Header.amd_machine_version_major = Value; - else if (ID == "machine_version_minor") - Header.amd_machine_version_minor = Value; - else if (ID == "machine_version_stepping") - Header.amd_machine_version_stepping = Value; - else if (ID == "kernel_code_entry_byte_offset") - Header.kernel_code_entry_byte_offset = Value; - else if (ID == "kernel_code_prefetch_byte_size") - Header.kernel_code_prefetch_byte_size = Value; - else if (ID == "max_scratch_backing_memory_byte_size") - Header.max_scratch_backing_memory_byte_size = Value; - else if (ID == "compute_pgm_rsrc1_vgprs") - Header.compute_pgm_resource_registers |= S_00B848_VGPRS(Value); - else if (ID == "compute_pgm_rsrc1_sgprs") - Header.compute_pgm_resource_registers |= S_00B848_SGPRS(Value); - else if (ID == "compute_pgm_rsrc1_priority") - Header.compute_pgm_resource_registers |= S_00B848_PRIORITY(Value); - else if (ID == 
"compute_pgm_rsrc1_float_mode") - Header.compute_pgm_resource_registers |= S_00B848_FLOAT_MODE(Value); - else if (ID == "compute_pgm_rsrc1_priv") - Header.compute_pgm_resource_registers |= S_00B848_PRIV(Value); - else if (ID == "compute_pgm_rsrc1_dx10_clamp") - Header.compute_pgm_resource_registers |= S_00B848_DX10_CLAMP(Value); - else if (ID == "compute_pgm_rsrc1_debug_mode") - Header.compute_pgm_resource_registers |= S_00B848_DEBUG_MODE(Value); - else if (ID == "compute_pgm_rsrc1_ieee_mode") - Header.compute_pgm_resource_registers |= S_00B848_IEEE_MODE(Value); - else if (ID == "compute_pgm_rsrc2_scratch_en") - Header.compute_pgm_resource_registers |= (S_00B84C_SCRATCH_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_user_sgpr") - Header.compute_pgm_resource_registers |= (S_00B84C_USER_SGPR(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tgid_x_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_X_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tgid_y_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Y_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tgid_z_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Z_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tg_size_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TG_SIZE_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tidig_comp_cnt") - Header.compute_pgm_resource_registers |= - (S_00B84C_TIDIG_COMP_CNT(Value) << 32); - else if (ID == "compute_pgm_rsrc2_excp_en_msb") - Header.compute_pgm_resource_registers |= - (S_00B84C_EXCP_EN_MSB(Value) << 32); - else if (ID == "compute_pgm_rsrc2_lds_size") - Header.compute_pgm_resource_registers |= (S_00B84C_LDS_SIZE(Value) << 32); - else if (ID == "compute_pgm_rsrc2_excp_en") - Header.compute_pgm_resource_registers |= (S_00B84C_EXCP_EN(Value) << 32); - else if (ID == "compute_pgm_resource_registers") - Header.compute_pgm_resource_registers = Value; - else if (ID == "enable_sgpr_private_segment_buffer") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT); - else if (ID == "enable_sgpr_dispatch_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT); - else if (ID == "enable_sgpr_queue_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT); - else if (ID == "enable_sgpr_kernarg_segment_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT); - else if (ID == "enable_sgpr_dispatch_id") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT); - else if (ID == "enable_sgpr_flat_scratch_init") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT); - else if (ID == "enable_sgpr_private_segment_size") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_x") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_y") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_z") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT); - else if (ID == "enable_ordered_append_gds") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT); 
- else if (ID == "private_element_size") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT); - else if (ID == "is_ptr64") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_PTR64_SHIFT); - else if (ID == "is_dynamic_callstack") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT); - else if (ID == "is_debug_enabled") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT); - else if (ID == "is_xnack_enabled") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT); - else if (ID == "workitem_private_segment_byte_size") - Header.workitem_private_segment_byte_size = Value; - else if (ID == "workgroup_group_segment_byte_size") - Header.workgroup_group_segment_byte_size = Value; - else if (ID == "gds_segment_byte_size") - Header.gds_segment_byte_size = Value; - else if (ID == "kernarg_segment_byte_size") - Header.kernarg_segment_byte_size = Value; - else if (ID == "workgroup_fbarrier_count") - Header.workgroup_fbarrier_count = Value; - else if (ID == "wavefront_sgpr_count") - Header.wavefront_sgpr_count = Value; - else if (ID == "workitem_vgpr_count") - Header.workitem_vgpr_count = Value; - else if (ID == "reserved_vgpr_first") - Header.reserved_vgpr_first = Value; - else if (ID == "reserved_vgpr_count") - Header.reserved_vgpr_count = Value; - else if (ID == "reserved_sgpr_first") - Header.reserved_sgpr_first = Value; - else if (ID == "reserved_sgpr_count") - Header.reserved_sgpr_count = Value; - else if (ID == "debug_wavefront_private_segment_offset_sgpr") - Header.debug_wavefront_private_segment_offset_sgpr = Value; - else if (ID == "debug_private_segment_buffer_sgpr") - Header.debug_private_segment_buffer_sgpr = Value; - else if (ID == "kernarg_segment_alignment") - Header.kernarg_segment_alignment = Value; - else if (ID == "group_segment_alignment") - Header.group_segment_alignment = Value; - else if (ID == "private_segment_alignment") - Header.private_segment_alignment = Value; - else if (ID == "wavefront_size") - Header.wavefront_size = Value; - else if (ID == "call_convention") - Header.call_convention = Value; - else if (ID == "runtime_loader_kernel_symbol") - Header.runtime_loader_kernel_symbol = Value; - else - return TokError("amd_kernel_code_t value not recognized."); - return false; } @@ -930,9 +1288,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { while (true) { - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("amd_kernel_code_t values must begin on a new line"); - // Lex EndOfStatement. This is in a while loop, because lexing a comment // will set the current token to EndOfStatement. 
while(getLexer().is(AsmToken::EndOfStatement)) @@ -1026,7 +1381,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); - if (IDVal == ".hsatext" || IDVal == ".text") + if (IDVal == ".hsatext") return ParseSectionDirectiveHSAText(); if (IDVal == ".amdgpu_hsa_kernel") @@ -1078,19 +1433,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, return true; } -static bool operandsHaveModifiers(const OperandVector &Operands) { - - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isRegKind() && Op.hasModifiers()) - return true; - if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || - Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) - return true; - } - return false; -} - AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { @@ -1107,113 +1449,59 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { getLexer().is(AsmToken::EndOfStatement)) return ResTy; - bool Negate = false, Abs = false; - if (getLexer().getKind()== AsmToken::Minus) { - Parser.Lex(); - Negate = true; - } - - if (getLexer().getKind() == AsmToken::Pipe) { - Parser.Lex(); - Abs = true; - } - - switch(getLexer().getKind()) { - case AsmToken::Integer: { - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { - Error(S, "invalid immediate: only 32-bit values are legal"); - return MatchOperand_ParseFail; - } + ResTy = parseRegOrImm(Operands); - if (Negate) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); - return MatchOperand_Success; - } - case AsmToken::Real: { - // FIXME: We should emit an error if a double precisions floating-point - // value is used. I'm not sure the best way to detect this. - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; + if (ResTy == MatchOperand_Success) + return ResTy; - APFloat F((float)BitsToDouble(IntVal)); - if (Negate) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); + if (getLexer().getKind() == AsmToken::Identifier) { + // If this identifier is a symbol, we want to create an expression for it. + // It is a little difficult to distinguish between a symbol name, and + // an instruction flag like 'gds'. In order to do this, we parse + // all tokens as expressions and then treat the symbol name as the token + // string when we want to interpret the operand as a token. + const auto &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + const MCExpr *Expr = nullptr; + if (!Parser.parseExpression(Expr)) { + Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S)); return MatchOperand_Success; } - case AsmToken::Identifier: { - SMLoc S, E; - unsigned RegNo; - if (!ParseRegister(RegNo, S, E)) { - - bool HasModifiers = operandsHaveModifiers(Operands); - unsigned Modifiers = 0; - - if (Negate) - Modifiers |= 0x1; - - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) - return MatchOperand_ParseFail; - Parser.Lex(); - Modifiers |= 0x2; - } - if (Modifiers && !HasModifiers) { - // We are adding a modifier to src1 or src2 and previous sources - // don't have modifiers, so we need to go back and empty modifers - // for each previous source. 
- for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; - --PrevRegIdx) { - - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); - RegOp.setModifiers(0); - } - } - - - Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), - isForcedVOP3())); - - if (HasModifiers || Modifiers) { - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); - RegOp.setModifiers(Modifiers); - - } - } else { - Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), - S)); - Parser.Lex(); - } - return MatchOperand_Success; - } - default: - return MatchOperand_NoMatch; + Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc())); + Parser.Lex(); + return MatchOperand_Success; } + return MatchOperand_NoMatch; } -bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, - SMLoc NameLoc, OperandVector &Operands) { - +StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { // Clear any forced encodings from the previous instruction. setForcedEncodingSize(0); + setForcedDPP(false); + setForcedSDWA(false); - if (Name.endswith("_e64")) + if (Name.endswith("_e64")) { setForcedEncodingSize(64); - else if (Name.endswith("_e32")) + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_e32")) { setForcedEncodingSize(32); + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_dpp")) { + setForcedDPP(true); + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_sdwa")) { + setForcedSDWA(true); + return Name.substr(0, Name.size() - 5); + } + return Name; +} +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { // Add the instruction mnemonic + Name = parseMnemonicSuffix(Name); Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); while (!getLexer().is(AsmToken::EndOfStatement)) { @@ -1225,20 +1513,21 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, switch (Res) { case MatchOperand_Success: break; - case MatchOperand_ParseFail: return Error(getLexer().getLoc(), - "failed parsing operand."); - case MatchOperand_NoMatch: return Error(getLexer().getLoc(), - "not a valid operand."); + case MatchOperand_ParseFail: + Error(getLexer().getLoc(), "failed parsing operand."); + while (!getLexer().is(AsmToken::EndOfStatement)) { + Parser.Lex(); + } + return true; + case MatchOperand_NoMatch: + Error(getLexer().getLoc(), "not a valid operand."); + while (!getLexer().is(AsmToken::EndOfStatement)) { + Parser.Lex(); + } + return true; } } - // Once we reach end of statement, continue parsing so we can add default - // values for optional arguments. - AMDGPUAsmParser::OperandMatchResultTy Res; - while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { - if (Res != MatchOperand_Success) - return Error(getLexer().getLoc(), "failed parsing operand."); - } return false; } @@ -1247,22 +1536,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, //===----------------------------------------------------------------------===// AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default) { - - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
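// parseMnemonicSuffix is plain suffix stripping, so its behavior can be
// pinned down in a few lines. A standalone sketch with the same four
// suffixes (hypothetical names, separate from the parser class):
#include <cassert>
#include <string>

enum ForcedEnc { FE_NONE, FE_E32, FE_E64, FE_DPP, FE_SDWA };

static std::string stripMnemonicSuffix(const std::string &Name, ForcedEnc &F) {
  auto EndsWith = [&](const std::string &S) {
    return Name.size() > S.size() &&
           Name.compare(Name.size() - S.size(), S.size(), S) == 0;
  };
  if (EndsWith("_e64"))  { F = FE_E64;  return Name.substr(0, Name.size() - 4); }
  if (EndsWith("_e32"))  { F = FE_E32;  return Name.substr(0, Name.size() - 4); }
  if (EndsWith("_dpp"))  { F = FE_DPP;  return Name.substr(0, Name.size() - 4); }
  if (EndsWith("_sdwa")) { F = FE_SDWA; return Name.substr(0, Name.size() - 5); }
  F = FE_NONE;
  return Name;
}

int main() {
  ForcedEnc F;
  assert(stripMnemonicSuffix("v_add_f32_e64", F) == "v_add_f32" && F == FE_E64);
  assert(stripMnemonicSuffix("v_mov_b32_sdwa", F) == "v_mov_b32" && F == FE_SDWA);
  assert(stripMnemonicSuffix("s_mov_b32", F) == "s_mov_b32" && F == FE_NONE);
  return 0;
}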
- if (getLexer().is(AsmToken::EndOfStatement)) { - Int = Default; - return MatchOperand_Success; - } - +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { switch(getLexer().getKind()) { default: return MatchOperand_NoMatch; case AsmToken::Identifier: { - StringRef OffsetName = Parser.getTok().getString(); - if (!OffsetName.equals(Prefix)) + StringRef Name = Parser.getTok().getString(); + if (!Name.equals(Prefix)) { return MatchOperand_NoMatch; + } Parser.Lex(); if (getLexer().isNot(AsmToken::Colon)) @@ -1282,16 +1563,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { + enum AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { SMLoc S = Parser.getTok().getLoc(); - int64_t Offset = 0; + int64_t Value = 0; - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); if (Res != MatchOperand_Success) return Res; - Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + if (ConvertResult && !ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy)); return MatchOperand_Success; } @@ -1327,101 +1613,52 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, return MatchOperand_Success; } -static bool operandsHasOptionalOp(const OperandVector &Operands, - const OptionalOperand &OOp) { - for (unsigned i = 0; i < Operands.size(); i++) { - const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); - if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || - (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) - return true; +typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; +void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, + OptionalImmIndexMap& OptionalIdx, + enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) { + auto i = OptionalIdx.find(ImmT); + if (i != OptionalIdx.end()) { + unsigned Idx = i->second; + ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1); + } else { + Inst.addOperand(MCOperand::createImm(Default)); } - return false; }
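// addOptionalImmOperand decouples the order operands were written in from
// the order the encoder needs: parsed optional operands are recorded by type
// during parsing, then emitted canonically with defaults filled in. A
// stripped-down model (plain integers stand in for MCInst operands; note the
// real map stores operand indices, not values):
#include <cassert>
#include <map>
#include <vector>

enum ImmTySketch { TY_OFFSET, TY_GLC, TY_SLC };

static void addOptionalImm(std::vector<int64_t> &Inst,
                           const std::map<ImmTySketch, int64_t> &Parsed,
                           ImmTySketch Ty, int64_t Default = 0) {
  auto It = Parsed.find(Ty);
  Inst.push_back(It != Parsed.end() ? It->second : Default);
}

int main() {
  // Source text "slc offset:16": written order differs from encoding order.
  std::map<ImmTySketch, int64_t> Parsed = {{TY_SLC, 1}, {TY_OFFSET, 16}};
  std::vector<int64_t> Inst;
  addOptionalImm(Inst, Parsed, TY_OFFSET);
  addOptionalImm(Inst, Parsed, TY_GLC); // not written: default 0 emitted
  addOptionalImm(Inst, Parsed, TY_SLC);
  assert((Inst == std::vector<int64_t>{16, 0, 1}));
  return 0;
}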
AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps, - OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - for (const OptionalOperand &Op : OptionalOps) { - if (operandsHasOptionalOp(Operands, Op)) - continue; - AMDGPUAsmParser::OperandMatchResultTy Res; - int64_t Value; - if (Op.IsBit) { - Res = parseNamedBit(Op.Name, Operands, Op.Type); - if (Res == MatchOperand_NoMatch) - continue; - return Res; - } - - Res = parseIntWithPrefix(Op.Name, Value, Op.Default); - - if (Res == MatchOperand_NoMatch) - continue; - - if (Res != MatchOperand_Success) - return Res; +AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) { + if (getLexer().isNot(AsmToken::Identifier)) { + return MatchOperand_NoMatch; + } + StringRef Tok = Parser.getTok().getString(); + if (Tok != Prefix) { + return MatchOperand_NoMatch; + } - if (Op.ConvertResult && !Op.ConvertResult(Value)) { - return MatchOperand_ParseFail; - } + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) { + return MatchOperand_ParseFail; + } - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); - return MatchOperand_Success; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Identifier)) { + return MatchOperand_ParseFail; } - return MatchOperand_NoMatch; + + Value = Parser.getTok().getString(); + return MatchOperand_Success; } //===----------------------------------------------------------------------===// // ds //===----------------------------------------------------------------------===// -static const OptionalOperand DSOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -static const OptionalOperand DSOptionalOpsOff01 [] = { - {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, - {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOps, Operands); -} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOpsOff01, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - AMDGPUAsmParser::OperandMatchResultTy Res = - parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOffset)); - Res = MatchOperand_Success; - } - return Res; -} - -bool AMDGPUOperand::isDSOffset() const { - return isImm() && isUInt<16>(getImm()); -} - -bool AMDGPUOperand::isDSOffset01() const { - return isImm() && isUInt<8>(getImm()); -} - void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + OptionalImmIndexMap OptionalIdx; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -1436,13 +1673,10 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } - unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; - unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); - ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 - ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } @@ -1469,12 +1703,10 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { OptionalIdx[Op.getImmTy()] = i; } - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); if (!GDSOnly) { - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } @@ -1516,7 +1749,7 @@ bool
AMDGPUAsmParser::parseCnt(int64_t &IntVal) { CntMask = 0x7; CntShift = 4; } else if (CntName == "lgkmcnt") { - CntMask = 0x7; + CntMask = 0xf; CntShift = 8; } else { return true; @@ -1532,8 +1765,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { // Disable all counters by default. // vmcnt [3:0] // expcnt [6:4] - // lgkmcnt [10:8] - int64_t CntVal = 0x77f; + // lgkmcnt [11:8] + int64_t CntVal = 0xf7f; SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { @@ -1555,141 +1788,346 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); -} +bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) { + using namespace llvm::AMDGPU::Hwreg; -//===----------------------------------------------------------------------===// -// sopp branch targets -//===----------------------------------------------------------------------===// + if (Parser.getTok().getString() != "hwreg") + return true; + Parser.Lex(); -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + Parser.Lex(); - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) - return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); - return MatchOperand_Success; + if (getLexer().is(AsmToken::Identifier)) { + HwReg.IsSymbolic = true; + HwReg.Id = ID_UNKNOWN_; + const StringRef tok = Parser.getTok().getString(); + for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) { + if (tok == IdSymbolic[i]) { + HwReg.Id = i; + break; + } } + Parser.Lex(); + } else { + HwReg.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(HwReg.Id)) + return true; + } - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr( - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; + if (getLexer().is(AsmToken::RParen)) { + Parser.Lex(); + return false; } -} -//===----------------------------------------------------------------------===// -// flat -//===----------------------------------------------------------------------===// + // optional params + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); -static const OptionalOperand FlatOptionalOps [] = { - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Offset)) + return true; -static const OptionalOperand FlatAtomicOptionalOps [] = { - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatOptionalOps, Operands); -} + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Width)) + return true; -AMDGPUAsmParser::OperandMatchResultTy 
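// With lgkmcnt widened to four bits here, the s_waitcnt simm16 layout is
// vmcnt in bits [3:0], expcnt in [6:4] and lgkmcnt in [11:8]; that is why the
// "all counters disabled" default above is 0xf7f. A sketch of the arithmetic:
#include <cassert>
#include <cstdint>

static uint16_t encodeWaitcnt(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return (uint16_t)((Vm & 0xf) | ((Exp & 0x7) << 4) | ((Lgkm & 0xf) << 8));
}

int main() {
  assert(encodeWaitcnt(0xf, 0x7, 0xf) == 0xf7f); // default: nothing waited on
  assert(encodeWaitcnt(0, 0x7, 0xf) == 0xf70);   // s_waitcnt vmcnt(0)
  assert(encodeWaitcnt(0, 0x7, 0) == 0x070);     // vmcnt(0) lgkmcnt(0)
  return 0;
}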
-AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatAtomicOptionalOps, Operands); + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + + return false; } -void AMDGPUAsmParser::cvtFlat(MCInst &Inst, - const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { + using namespace llvm::AMDGPU::Hwreg; - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + int64_t Imm16Val = 0; + SMLoc S = Parser.getTok().getLoc(); - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(Imm16Val)) + return MatchOperand_NoMatch; + if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { + Error(S, "invalid immediate: only 16-bit values are legal"); + // Do not return error code, but create an imm operand anyway and proceed + // to the next operand, if any. That avoids unnecessary error messages. + } + break; - // Handle 'glc' token which is sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) - continue; + case AsmToken::Identifier: { + OperandInfoTy HwReg(ID_UNKNOWN_); + int64_t Offset = OFFSET_DEFAULT_; + int64_t Width = WIDTH_M1_DEFAULT_ + 1; + if (parseHwregConstruct(HwReg, Offset, Width)) + return MatchOperand_ParseFail; + if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) { + if (HwReg.IsSymbolic) + Error(S, "invalid symbolic name of hardware register"); + else + Error(S, "invalid code of hardware register: only 6-bit values are legal"); + } + if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset)) + Error(S, "invalid bit offset: only 5-bit values are legal"); + if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1)) + Error(S, "invalid bitfield width: only values from 1 to 32 are legal"); + Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_); + } + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); + return MatchOperand_Success; } - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +bool AMDGPUOperand::isHwreg() const { + return isImmTy(ImmTyHwreg); +}
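// The simm16 assembled above packs the hwreg id into bits [5:0], the bit
// offset into [10:6] and width-1 into [15:11]. The shift constants
// (ID_SHIFT_ = 0, OFFSET_SHIFT_ = 6, WIDTH_M1_SHIFT_ = 11) are assumed here,
// but they are consistent with the 6-bit/5-bit/5-bit range checks in
// parseHwreg. A sketch of the packing:
#include <cassert>
#include <cstdint>

static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  assert(Id < 64 && Offset < 32 && Width >= 1 && Width <= 32);
  return (uint16_t)(Id | (Offset << 6) | ((Width - 1) << 11));
}

int main() {
  assert(encodeHwreg(6, 0, 1) == 6);       // hwreg(6, 0, 1): a single bit
  assert(encodeHwreg(1, 0, 32) == 0xf801); // full 32-bit read of register id 1
  assert(encodeHwreg(2, 4, 8) == (2u | (4u << 6) | (7u << 11)));
  return 0;
}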
+bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) { + using namespace llvm::AMDGPU::SendMsg; + + if (Parser.getTok().getString() != "sendmsg") + return true; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + Parser.Lex(); + + if (getLexer().is(AsmToken::Identifier)) { + Msg.IsSymbolic = true; + Msg.Id = ID_UNKNOWN_; + const std::string tok = Parser.getTok().getString(); + for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { + switch(i) { + default: continue; // Omit gaps. + case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break; + } + if (tok == IdSymbolic[i]) { + Msg.Id = i; + break; + } + } + Parser.Lex(); + } else { + Msg.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Msg.Id)) + return true; + if (getLexer().is(AsmToken::Integer)) + if (getParser().parseAbsoluteExpression(Msg.Id)) + Msg.Id = ID_UNKNOWN_; } + if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest. + return false; - // flat atomic instructions don't have a glc argument. - if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) { + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + return false; } - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} + assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG); + Operation.Id = ID_UNKNOWN_; + if (getLexer().is(AsmToken::Identifier)) { + Operation.IsSymbolic = true; + const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; + const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; + const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; + const StringRef Tok = Parser.getTok().getString(); + for (int i = F; i < L; ++i) { + if (Tok == S[i]) { + Operation.Id = i; + break; + } + } + Parser.Lex(); + } else { + Operation.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Operation.Id)) + return true; + } -//===----------------------------------------------------------------------===// -// mubuf -//===----------------------------------------------------------------------===// + if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { + // Stream id is optional. + if (getLexer().is(AsmToken::RParen)) { + Parser.Lex(); + return false; + } -static const OptionalOperand MubufOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { - return parseOptionalOps(MubufOptionalOps, Operands); + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(StreamId)) + return true; + } + + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + return false; } AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOffset(OperandVector &Operands) { - return parseIntWithPrefix("offset", Operands); +AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { + using namespace llvm::AMDGPU::SendMsg; + + int64_t Imm16Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(Imm16Val)) + return MatchOperand_NoMatch; + if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { + Error(S, "invalid immediate: only 16-bit values are legal"); + // Do not return error code, but create an imm operand anyway and proceed + // to the next operand, if any. That avoids unnecessary error messages. 
+ } + break; + case AsmToken::Identifier: { + OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Operation(OP_UNKNOWN_); + int64_t StreamId = STREAM_ID_DEFAULT_; + if (parseSendMsgConstruct(Msg, Operation, StreamId)) + return MatchOperand_ParseFail; + do { + // Validate and encode message ID. + if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE) + || Msg.Id == ID_SYSMSG)) { + if (Msg.IsSymbolic) + Error(S, "invalid/unsupported symbolic name of message"); + else + Error(S, "invalid/unsupported code of message"); + break; + } + Imm16Val = (Msg.Id << ID_SHIFT_); + // Validate and encode operation ID. + if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) { + if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) { + if (Operation.IsSymbolic) + Error(S, "invalid symbolic name of GS_OP"); + else + Error(S, "invalid code of GS_OP: only 2-bit values are legal"); + break; + } + if (Operation.Id == OP_GS_NOP + && Msg.Id != ID_GS_DONE) { + Error(S, "invalid GS_OP: NOP is for GS_DONE only"); + break; + } + Imm16Val |= (Operation.Id << OP_SHIFT_); + } + if (Msg.Id == ID_SYSMSG) { + if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) { + if (Operation.IsSymbolic) + Error(S, "invalid/unsupported symbolic name of SYSMSG_OP"); + else + Error(S, "invalid/unsupported code of SYSMSG_OP"); + break; + } + Imm16Val |= (Operation.Id << OP_SHIFT_); + } + // Validate and encode stream ID. + if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { + if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) { + Error(S, "invalid stream id: only 2-bit values are legal"); + break; + } + Imm16Val |= (StreamId << STREAM_ID_SHIFT_); + } + } while (0); + } + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); + return MatchOperand_Success; } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseGLC(OperandVector &Operands) { - return parseNamedBit("glc", Operands); +bool AMDGPUOperand::isSendMsg() const { + return isImmTy(ImmTySendMsg); } +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSLC(OperandVector &Operands) { - return parseNamedBit("slc", Operands); +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseTFE(OperandVector &Operands) { - return parseNamedBit("tfe", Operands); +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC); } -bool AMDGPUOperand::isMubufOffset() const { - return isImm() && isUInt<12>(getImm()); 
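// parseSendMsgOp packs the message id, operation and stream id into simm16.
// The SendMsg shift constants are not visible in this hunk; the values below
// (ID_SHIFT_ = 0, OP_SHIFT_ = 4, STREAM_ID_SHIFT_ = 8) are an assumption that
// matches the 2-bit operation and 2-bit stream-id range checks above. Sketch:
#include <cassert>
#include <cstdint>

static uint16_t encodeSendMsg(unsigned MsgId, unsigned OpId, unsigned Stream) {
  return (uint16_t)(MsgId | (OpId << 4) | (Stream << 8));
}

int main() {
  // With the conventional ids ID_GS = 2 and OP_GS_EMIT = 2 (also assumed),
  // "s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)" would encode as 0x22.
  assert(encodeSendMsg(2, 2, 0) == 0x22);
  assert(encodeSendMsg(2, 2, 1) == 0x122); // same message on stream 1
  return 0;
}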
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC); } -void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE); +} + +void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, + const OperandVector &Operands, + bool IsAtomic, bool IsAtomicReturn) { + OptionalImmIndexMap OptionalIdx; + assert(IsAtomicReturn ? IsAtomic : true); for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -1717,36 +2155,111 @@ void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } - assert(OptionalIdx.size() == 4); - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. + if (IsAtomicReturn) { + MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. + Inst.insert(I, *I); + } - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + if (!IsAtomic) { // glc is hard-coded. + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDMask(OperandVector &Operands) { - return parseIntWithPrefix("dmask", Operands); +void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + OptionalImmIndexMap OptionalIdx; + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + + // Add the register arguments + if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); + continue; + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseUNorm(OperandVector 
&Operands) { - return parseNamedBit("unorm", Operands); +void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + // Add src, same as dst + ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + + OptionalImmIndexMap OptionalIdx; + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + + // Add the register arguments + if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); + continue; + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseR128(OperandVector &Operands) { - return parseNamedBit("r128", Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE); } //===----------------------------------------------------------------------===// @@ -1766,6 +2279,14 @@ bool AMDGPUOperand::isSMRDLiteralOffset() const { return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -1792,91 +2313,435 @@ static bool ConvertOmodDiv(int64_t &Div) { return false; } -static const OptionalOperand VOP3OptionalOps [] = { - {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, - {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, - {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +static bool ConvertBoundCtrl(int64_t &BoundCtrl) { + if (BoundCtrl == 0) { + BoundCtrl = 1; + return true; + } else if (BoundCtrl == -1) { + BoundCtrl = 0; + return true; + } + return false; +} + +// Note: the order in this table matches the order of operands in AsmString. 
+static const OptionalOperand AMDGPUOptionalOperandTable[] = { + {"offen", AMDGPUOperand::ImmTyOffen, true, nullptr}, + {"idxen", AMDGPUOperand::ImmTyIdxen, true, nullptr}, + {"addr64", AMDGPUOperand::ImmTyAddr64, true, nullptr}, + {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr}, + {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, + {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, + {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr}, + {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, + {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr}, + {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, + {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, + {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, + {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, + {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, + {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, + {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, + {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, + {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, }; -static bool isVOP3(OperandVector &Operands) { - if (operandsHaveModifiers(Operands)) - return true; +AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { + OperandMatchResultTy res; + for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { + // try to parse any optional operand here + if (Op.IsBit) { + res = parseNamedBit(Op.Name, Operands, Op.Type); + } else if (Op.Type == AMDGPUOperand::ImmTyOModSI) { + res = parseOModOperand(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstSel || + Op.Type == AMDGPUOperand::ImmTySdwaSrc0Sel || + Op.Type == AMDGPUOperand::ImmTySdwaSrc1Sel) { + res = parseSDWASel(Operands, Op.Name, Op.Type); + } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) { + res = parseSDWADstUnused(Operands); + } else { + res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + } + if (res != MatchOperand_NoMatch) { + return res; + } + } + return MatchOperand_NoMatch; +} - AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); +AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) +{ + StringRef Name = Parser.getTok().getString(); + if (Name == "mul") { + return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul); + } else if (Name == "div") { + return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv); + } else { + return MatchOperand_NoMatch; + } +} - if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) - return true; +void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + for (unsigned E = Operands.size(); I != E; ++I) + ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1); +} - if (Operands.size() >= 5) - return true; +void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) { + uint64_t TSFlags 
= MII.get(Inst.getOpcode()).TSFlags; + if (TSFlags & SIInstrFlags::VOP3) { + cvtVOP3(Inst, Operands); + } else { + cvtId(Inst, Operands); + } +} - if (Operands.size() > 3) { - AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); - if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || - Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) - return true; +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isRegOrImmWithInputMods()) { + // only fp modifiers allowed in VOP3 + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImm()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); +} + +//===----------------------------------------------------------------------===// +// dpp +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isDPPCtrl() const { + bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm()); + if (result) { + int64_t Imm = getImm(); + return ((Imm >= 0x000) && (Imm <= 0x0ff)) || + ((Imm >= 0x101) && (Imm <= 0x10f)) || + ((Imm >= 0x111) && (Imm <= 0x11f)) || + ((Imm >= 0x121) && (Imm <= 0x12f)) || + (Imm == 0x130) || + (Imm == 0x134) || + (Imm == 0x138) || + (Imm == 0x13c) || + (Imm == 0x140) || + (Imm == 0x141) || + (Imm == 0x142) || + (Imm == 0x143); } return false; } AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { - - // The value returned by this function may change after parsing - // an operand so store the original value here. 
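// The ranges accepted by isDPPCtrl above map one-to-one onto the dpp_ctrl
// spellings parsed below: quad_perm occupies 0x000-0x0ff (four 2-bit lane
// selects packed little-end first), row_shl/row_shr/row_ror:N sit at
// 0x100/0x110/0x120 + N (N = 1-15), and the wave/row controls are fixed
// codes. Collected in one sketch (the enum names are illustrative, not the
// backend's):
#include <cassert>
#include <cstdint>

enum DppCtrlSketch : uint16_t {
  DPP_WAVE_SHL = 0x130, DPP_WAVE_ROL = 0x134,
  DPP_WAVE_SHR = 0x138, DPP_WAVE_ROR = 0x13c,
  DPP_ROW_MIRROR = 0x140, DPP_ROW_HALF_MIRROR = 0x141,
  DPP_ROW_BCAST15 = 0x142, DPP_ROW_BCAST31 = 0x143
};

static bool isValidDppCtrl(uint16_t V) {
  return V <= 0x0ff ||                 // quad_perm:[a,b,c,d]
         (V >= 0x101 && V <= 0x10f) || // row_shl:1-15
         (V >= 0x111 && V <= 0x11f) || // row_shr:1-15
         (V >= 0x121 && V <= 0x12f) || // row_ror:1-15
         V == DPP_WAVE_SHL || V == DPP_WAVE_ROL ||
         V == DPP_WAVE_SHR || V == DPP_WAVE_ROR ||
         (V >= DPP_ROW_MIRROR && V <= DPP_ROW_BCAST31);
}

int main() {
  assert(isValidDppCtrl(0x101));  // row_shl:1
  assert(!isValidDppCtrl(0x100)); // a shift of 0 is rejected
  // quad_perm:[0,1,2,3] packs to 0xe4, matching the shifts applied below.
  assert(isValidDppCtrl(0 | (1 << 2) | (2 << 4) | (3 << 6)));
  return 0;
}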
- bool HasModifiers = operandsHaveModifiers(Operands); - - bool IsVOP3 = isVOP3(Operands); - if (HasModifiers || IsVOP3 || - getLexer().isNot(AsmToken::EndOfStatement) || - getForcedEncodingSize() == 64) { - - AMDGPUAsmParser::OperandMatchResultTy Res = - parseOptionalOps(VOP3OptionalOps, Operands); - - if (!HasModifiers && Res == MatchOperand_Success) { - // We have added a modifier operation, so we need to make sure all - // previous register operands have modifiers - for (unsigned i = 2, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isReg()) - Op.setModifiers(0); +AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Prefix; + int64_t Int; + + if (getLexer().getKind() == AsmToken::Identifier) { + Prefix = Parser.getTok().getString(); + } else { + return MatchOperand_NoMatch; + } + + if (Prefix == "row_mirror") { + Int = 0x140; + } else if (Prefix == "row_half_mirror") { + Int = 0x141; + } else { + // Check to prevent parseDPPCtrlOps from eating invalid tokens + if (Prefix != "quad_perm" + && Prefix != "row_shl" + && Prefix != "row_shr" + && Prefix != "row_ror" + && Prefix != "wave_shl" + && Prefix != "wave_rol" + && Prefix != "wave_shr" + && Prefix != "wave_ror" + && Prefix != "row_bcast") { + return MatchOperand_NoMatch; + } + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + if (Prefix == "quad_perm") { + // quad_perm:[%d,%d,%d,%d] + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int = getLexer().getTok().getIntVal(); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 2); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 4); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 6); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::RBrac)) + return MatchOperand_ParseFail; + + } else { + // sel:%d + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int = getLexer().getTok().getIntVal(); + + if (Prefix == "row_shl") { + Int |= 0x100; + } else if (Prefix == "row_shr") { + Int |= 0x110; + } else if (Prefix == "row_ror") { + Int |= 0x120; + } else if (Prefix == "wave_shl") { + Int = 0x130; + } else if (Prefix == "wave_rol") { + Int = 0x134; + } else if (Prefix == "wave_shr") { + Int = 0x138; + } else if (Prefix == "wave_ror") { + Int = 0x13C; + } else if (Prefix == "row_bcast") { + if (Int == 15) { + Int = 0x142; + } else if (Int == 31) { + Int = 0x143; + } else { + return MatchOperand_ParseFail; + } + } else { + return MatchOperand_ParseFail; } } - return Res; } - return MatchOperand_NoMatch; + Parser.Lex(); // eat last token + + Operands.push_back(AMDGPUOperand::CreateImm(Int, S, + AMDGPUOperand::ImmTyDppCtrl)); + return MatchOperand_Success; } -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { +AMDGPUOperand::Ptr 
AMDGPUAsmParser::defaultRowMask() const { + return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); +} - unsigned i = 1; +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { + return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); +} + +void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - if (Desc.getNumDefs() > 0) { - ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - std::map OptionalIdx; + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + // Add the register arguments + if (Op.isRegOrImmWithInputMods()) { + // Only float modifiers supported in DPP + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isDPPCtrl()) { + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("Invalid operand type"); + } + } - if (operandsHaveModifiers(Operands)) { - for (unsigned e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); +} - if (Op.isRegWithInputMods()) { - ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); - continue; - } - OptionalIdx[Op.getImmTy()] = i; - } +//===----------------------------------------------------------------------===// +// sdwa +//===----------------------------------------------------------------------===// - unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; - unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, + AMDGPUOperand::ImmTy Type) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Value; + AMDGPUAsmParser::OperandMatchResultTy res; - ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); - } else { - for (unsigned e = Operands.size(); i != e; ++i) - ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); + res = parseStringWithPrefix(Prefix, Value); + if (res != MatchOperand_Success) { + return res; + } + + int64_t Int; + Int = StringSwitch(Value) + .Case("BYTE_0", 0) + .Case("BYTE_1", 1) + .Case("BYTE_2", 2) + .Case("BYTE_3", 3) + .Case("WORD_0", 4) + .Case("WORD_1", 5) + .Case("DWORD", 6) + .Default(0xffffffff); + Parser.Lex(); // eat last token + + if (Int == 0xffffffff) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Value; + AMDGPUAsmParser::OperandMatchResultTy res; + + res = parseStringWithPrefix("dst_unused", Value); + if (res != MatchOperand_Success) { + return res; + } 
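The sel keywords accepted by parseSDWASel() above map onto small integer encodings, with dword selection (6) used as the default when the operand is omitted. A standalone sketch of the mapping, in plain standard C++ (illustrative only, not the LLVM implementation):

#include <cstdint>
#include <map>
#include <string>

// Returns the sdwa sel encoding, or -1 for an unknown keyword; the parser's
// 0xffffffff sentinel plays the same "parse fail" role.
static int64_t sdwaSelEncoding(const std::string &Name) {
  static const std::map<std::string, int64_t> Table = {
      {"BYTE_0", 0}, {"BYTE_1", 1}, {"BYTE_2", 2}, {"BYTE_3", 3},
      {"WORD_0", 4}, {"WORD_1", 5}, {"DWORD",  6}};
  auto It = Table.find(Name);
  return It == Table.end() ? -1 : It->second;
}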
+
+  int64_t Int;
+  Int = StringSwitch<int64_t>(Value)
+        .Case("UNUSED_PAD", 0)
+        .Case("UNUSED_SEXT", 1)
+        .Case("UNUSED_PRESERVE", 2)
+        .Default(0xffffffff);
+  Parser.Lex(); // eat last token
+
+  if (Int == 0xffffffff) {
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
+                                              AMDGPUOperand::ImmTySdwaDstUnused));
+  return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP1);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+}
+
+void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+                              uint64_t BasicInstType) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+    // Add the register arguments
+    if (BasicInstType == SIInstrFlags::VOPC &&
+        Op.isReg() &&
+        Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOPC sdwa uses the "vcc" token as dst. Skip it.
+      continue;
+    } else if (Op.isRegOrImmWithInputMods()) {
+      Op.addRegOrImmWithInputModsOperands(Inst, 2);
+    } else if (Op.isImm()) {
+      // Handle optional arguments
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("Invalid operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+
+  if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) {
+    // V_NOP_sdwa has no optional sdwa arguments
+    return;
+  }
+  switch (BasicInstType) {
+  case SIInstrFlags::VOP1: {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+    break;
+  }
+  case SIInstrFlags::VOP2: {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+    break;
+  }
+  case SIInstrFlags::VOPC: {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+    break;
+  }
+  default:
+    llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
+  }
+}
@@ -1890,3 +2755,37 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {
 #define GET_MATCHER_IMPLEMENTATION
 #include "AMDGPUGenAsmMatcher.inc"
+
+// This function should be defined after the auto-generated include so that we
+// have the MatchClassKind enum defined.
+unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                                     unsigned Kind) {
+  // Tokens like "glc" would be parsed as immediate operands in ParseOperand().
+  // But MatchInstructionImpl() expects to meet a token there and fails to
+  // validate the operand. This method checks the case where we are given an
+  // immediate operand but expect the corresponding token.
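The idea behind this token-vs-immediate check, as a simplified standalone sketch before the actual body below (types and names here are stand-ins, not LLVM's):

#include <string>

enum MatchResult { Match_Success, Match_InvalidOperand };

struct ParsedOp {
  bool IsImm;        // named bits like "glc" parse as immediates
  std::string Bit;   // which named bit the immediate came from
};

// An immediate produced from the "glc" named bit satisfies the glc token
// class even though it is not a token operand.
static MatchResult validateGlc(const ParsedOp &Op) {
  return (Op.IsImm && Op.Bit == "glc") ? Match_Success : Match_InvalidOperand;
}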
+ AMDGPUOperand &Operand = (AMDGPUOperand&)Op; + switch (Kind) { + case MCK_addr64: + return Operand.isAddr64() ? Match_Success : Match_InvalidOperand; + case MCK_gds: + return Operand.isGDS() ? Match_Success : Match_InvalidOperand; + case MCK_glc: + return Operand.isGLC() ? Match_Success : Match_InvalidOperand; + case MCK_idxen: + return Operand.isIdxen() ? Match_Success : Match_InvalidOperand; + case MCK_offen: + return Operand.isOffen() ? Match_Success : Match_InvalidOperand; + case MCK_SSrc32: + // When operands have expression values, they will return true for isToken, + // because it is not possible to distinguish between a token and an + // expression at parse time. MatchInstructionImpl() will always try to + // match an operand as a token, when isToken returns true, and when the + // name of the expression is not a valid token, the match will fail, + // so we need to handle it here. + return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand; + case MCK_SoppBrTarget: + return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand; + default: return Match_InvalidOperand; + } +} diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt index 21ddc4eb83d2..70be7bb6eb36 100644 --- a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt +++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUAsmParser AMDGPUAsmParser.cpp ) + +add_dependencies(LLVMAMDGPUAsmParser LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile deleted file mode 100644 index 5ad219028036..000000000000 --- a/lib/Target/AMDGPU/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/AsmParser/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUAsmParser - -# Hack: we need to include 'main' AMDGPU target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td index c543814cae0d..f9a9f79126bd 100644 --- a/lib/Target/AMDGPU/CIInstructions.td +++ b/lib/Target/AMDGPU/CIInstructions.td @@ -25,14 +25,6 @@ // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 - -def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; - -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; - //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -108,9 +100,11 @@ defm S_DCACHE_INV_VOL : SMRD_Inval , // MUBUF Instructions //===----------------------------------------------------------------------===// +let DisableSIDecoder = 1 in { defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate , "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol >; +} //===----------------------------------------------------------------------===// // Flat Instructions @@ -159,129 +153,114 @@ defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 >; defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < - flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat >; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32, + atomic_cmp_swap_flat, v2i32, VReg_64 >; defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < - flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat >; defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < - flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat >; defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < - flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat >; defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < - flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat >; defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC < - flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat >; defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < - flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat >; defm FLAT_ATOMIC_AND : FLAT_ATOMIC < - flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat >; defm FLAT_ATOMIC_OR : FLAT_ATOMIC < - flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat >; defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < - flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat >; defm FLAT_ATOMIC_INC : FLAT_ATOMIC < - flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat >; defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < - flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat >; defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < - flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64, i64, atomic_swap_flat >; 
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64, + atomic_cmp_swap_flat, v2i64, VReg_128 >; defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < - flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat >; defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < - flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat >; defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < - flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat >; defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < - flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat >; defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < - flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat >; defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < - flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat >; defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < - flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat >; defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < - flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat >; defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < - flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat >; defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < - flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat >; defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < - flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat >; } // End SubtargetPredicate = isCIVI // CI Only flat instructions -let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in { defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32, + null_frag, v2f32, VReg_64 >; defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < - flat<0x3f>, "flat_atomic_fmin", VGPR_32 + flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32 >; defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < - flat<0x40>, "flat_atomic_fmax", VGPR_32 + flat<0x40>, "flat_atomic_fmax", VGPR_32, f32 >; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64, + null_frag, v2f64, VReg_128 >; defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < - flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64 >; defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < - flat<0x60>, "flat_atomic_fmax_x2", VReg_64 + flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64 >; -} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst - -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def 
: Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - +} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 //===----------------------------------------------------------------------===// // Flat Patterns @@ -289,12 +268,17 @@ def : Pat < let Predicates = [isCIVI] in { -// Patterns for global loads with no offset +// Patterns for global loads with no offset. class FlatLoadPat : Pat < (vt (node i64:$addr)), (inst $addr, 0, 0, 0) >; +class FlatLoadAtomicPat : Pat < + (vt (node i64:$addr)), + (inst $addr, 1, 0, 0) +>; + def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -303,9 +287,20 @@ def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadAtomicPat ; +def : FlatLoadAtomicPat ; + + class FlatStorePat : Pat < (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) + (inst $addr, $data, 0, 0, 0) +>; + +class FlatStoreAtomicPat : Pat < + // atomic store follows atomic binop convention so the address comes + // first. + (node i64:$addr, vt:$data), + (inst $addr, $data, 1, 0, 0) >; def : FlatStorePat ; @@ -314,20 +309,41 @@ def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; -class FlatAtomicPat : Pat < - (vt (node i64:$addr, vt:$data)), +def : FlatStoreAtomicPat ; +def : FlatStoreAtomicPat ; + +class FlatAtomicPat : Pat < + (vt (node i64:$addr, data_vt:$data)), (inst $addr, $data, 0, 0) >; def : FlatAtomicPat ; -def : FlatAtomicPat ; def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; def : FlatAtomicPat ; +def : FlatAtomicPat ; def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; + } // End Predicates = [isCIVI] diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index b9ef0e821763..45825c9cc76a 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -10,15 +10,30 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) add_public_tablegen_target(AMDGPUCommonTableGen) +# List of all GlobalISel files. +set(GLOBAL_ISEL_FILES + AMDGPUCallLowering.cpp + ) + +# Add GlobalISel files to the dependencies if the user wants to build it. 
+if(LLVM_BUILD_GLOBAL_ISEL) + set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) +else() + set(GLOBAL_ISEL_BUILD_FILES"") + set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +endif() + + add_llvm_target(AMDGPUCodeGen AMDILCFGStructurizer.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp AMDGPUAnnotateUniformValues.cpp AMDGPUAsmPrinter.cpp - AMDGPUDiagnosticInfoUnsupported.cpp + AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp @@ -33,10 +48,12 @@ add_llvm_target(AMDGPUCodeGen AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp + GCNHazardRecognizer.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp R600EmitClauseMarkers.cpp R600ExpandSpecialInstrs.cpp + R600FrameLowering.cpp R600InstrInfo.cpp R600ISelLowering.cpp R600MachineFunctionInfo.cpp @@ -44,11 +61,10 @@ add_llvm_target(AMDGPUCodeGen R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp - R600TextureIntrinsicsReplacer.cpp SIAnnotateControlFlow.cpp + SIDebuggerInsertNops.cpp SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp - SIFixSGPRLiveRanges.cpp SIFoldOperands.cpp SIFrameLowering.cpp SIInsertWaits.cpp @@ -62,10 +78,13 @@ add_llvm_target(AMDGPUCodeGen SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp + SIWholeQuadMode.cpp + ${GLOBAL_ISEL_BUILD_FILES} ) add_subdirectory(AsmParser) add_subdirectory(InstPrinter) +add_subdirectory(Disassembler) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) add_subdirectory(Utils) diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td index a6c3785c815b..98bc6e856ea2 100644 --- a/lib/Target/AMDGPU/CaymanInstructions.td +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -51,7 +51,6 @@ def : RsqPat; def : POW_Common ; defm DIV_cm : DIV_Common; -defm : Expand24UBitOps; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range @@ -203,27 +202,53 @@ def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, //===----------------------------------------------------------------------===// // 8-bit reads -def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +// 16-bit reads +def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] >; // 32-bit reads -def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] >; // 64-bit reads -def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] >; // 128-bit reads -def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 8-bit reads +def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] +>; + +// 16-bit reads +def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2, + [(set i32:$dst_gpr, 
(vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2, + [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2, + [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_ID2_128_cm : VTX_READ_128_cm <2, + [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; } // End isCayman diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp new file mode 100644 index 000000000000..e11de855fe5f --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -0,0 +1,437 @@ +//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file contains definition for AMDGPU ISA disassembler +// +//===----------------------------------------------------------------------===// + +// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)? + +#include "AMDGPUDisassembler.h" +#include "AMDGPU.h" +#include "AMDGPURegisterInfo.h" +#include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" + +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" + + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-disassembler" + +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; + + +inline static MCDisassembler::DecodeStatus +addOperand(MCInst &Inst, const MCOperand& Opnd) { + Inst.addOperand(Opnd); + return Opnd.isValid() ? 
+ MCDisassembler::Success : + MCDisassembler::SoftFail; +} + +#define DECODE_OPERAND2(RegClass, DecName) \ +static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ + auto DAsm = static_cast(Decoder); \ + return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ +} + +#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) + +DECODE_OPERAND(VGPR_32) +DECODE_OPERAND(VS_32) +DECODE_OPERAND(VS_64) + +DECODE_OPERAND(VReg_64) +DECODE_OPERAND(VReg_96) +DECODE_OPERAND(VReg_128) + +DECODE_OPERAND(SReg_32) +DECODE_OPERAND(SReg_32_XM0) +DECODE_OPERAND(SReg_64) +DECODE_OPERAND(SReg_128) +DECODE_OPERAND(SReg_256) +DECODE_OPERAND(SReg_512) + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + +#include "AMDGPUGenDisassemblerTables.inc" + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +template static inline T eatBytes(ArrayRef& Bytes) { + assert(Bytes.size() >= sizeof(T)); + const auto Res = support::endian::read(Bytes.data()); + Bytes = Bytes.slice(sizeof(T)); + return Res; +} + +DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, + MCInst &MI, + uint64_t Inst, + uint64_t Address) const { + assert(MI.getOpcode() == 0); + assert(MI.getNumOperands() == 0); + MCInst TmpInst; + const auto SavedBytes = Bytes; + if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { + MI = TmpInst; + return MCDisassembler::Success; + } + Bytes = SavedBytes; + return MCDisassembler::Fail; +} + +DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes_, + uint64_t Address, + raw_ostream &WS, + raw_ostream &CS) const { + CommentStream = &CS; + + // ToDo: AMDGPUDisassembler supports only VI ISA. + assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA."); + + const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); + Bytes = Bytes_.slice(0, MaxInstBytesNum); + + DecodeStatus Res = MCDisassembler::Fail; + do { + // ToDo: better to switch encoding length using some bit predicate + // but it is unknown yet, so try all we can + + // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 + // encodings + if (Bytes.size() >= 8) { + const uint64_t QW = eatBytes(Bytes); + Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); + if (Res) break; + } + + // Reinitialize Bytes as DPP64 could have eaten too much + Bytes = Bytes_.slice(0, MaxInstBytesNum); + + // Try decode 32-bit instruction + if (Bytes.size() < 4) break; + const uint32_t DW = eatBytes(Bytes); + Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); + if (Res) break; + + if (Bytes.size() < 4) break; + const uint64_t QW = ((uint64_t)eatBytes(Bytes) << 32) | DW; + Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); + } while (false); + + Size = Res ? 
(MaxInstBytesNum - Bytes.size()) : 0;
+  return Res;
+}
+
+const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
+  return getContext().getRegisterInfo()->
+    getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
+}
+
+inline
+MCOperand AMDGPUDisassembler::errOperand(unsigned V,
+                                         const Twine& ErrMsg) const {
+  *CommentStream << "Error: " + ErrMsg;
+
+  // ToDo: add support for error operands to MCInst.h
+  // return MCOperand::createError(V);
+  return MCOperand();
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
+  return MCOperand::createReg(RegId);
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
+                                               unsigned Val) const {
+  const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
+  if (Val >= RegCl.getNumRegs())
+    return errOperand(Val, Twine(getRegClassName(RegClassID)) +
+                           ": unknown register " + Twine(Val));
+  return createRegOperand(RegCl.getRegister(Val));
+}
+
+inline
+MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
+                                                unsigned Val) const {
+  // ToDo: SI/CI have 104 SGPRs, VI has 102.
+  // Valery: here we accept as much as we can, let the assembler sort it out.
+  int shift = 0;
+  switch (SRegClassID) {
+  case AMDGPU::SGPR_32RegClassID:
+  case AMDGPU::TTMP_32RegClassID:
+    break;
+  case AMDGPU::SGPR_64RegClassID:
+  case AMDGPU::TTMP_64RegClassID:
+    shift = 1;
+    break;
+  case AMDGPU::SGPR_128RegClassID:
+  case AMDGPU::TTMP_128RegClassID:
+    // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
+    // this bundle?
+  case AMDGPU::SReg_256RegClassID:
+    // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+    // this bundle?
+  case AMDGPU::SReg_512RegClassID:
+    shift = 2;
+    break;
+  // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
+  // this bundle?
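createSRegOperand() converts an encoded scalar register number into a register-class index by shifting according to the class width: 64-bit classes advance in pairs, 128-bit and wider in quads. A standalone illustration with made-up names:

#include <cstdio>

static unsigned sregIndex(unsigned EncodedVal, unsigned WidthInDwords) {
  unsigned Shift = WidthInDwords >= 4 ? 2 : (WidthInDwords == 2 ? 1 : 0);
  if (EncodedVal & ((1u << Shift) - 1))  // mirrors the alignment warning below
    std::fprintf(stderr, "scalar reg %u isn't aligned\n", EncodedVal);
  return EncodedVal >> Shift;
}

// Example: encoded value 4 names s[4:5] in a 64-bit class, i.e. index 2.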
+  default:
+    assert(false);
+    break;
+  }
+  if (Val % (1 << shift))
+    *CommentStream << "Warning: " << getRegClassName(SRegClassID)
+                   << ": scalar reg isn't aligned " << Val;
+  return createRegOperand(SRegClassID, Val >> shift);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
+  return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
+  return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+  return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
+  // The table-gen generated disassembler doesn't care about operand types,
+  // leaving only the register class, so an SSrc_32 operand turns into SReg_32
+  // and therefore we accept immediates and literals here as well.
+  return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const {
+  // SReg_32_XM0 is SReg_32 without M0
+  return decodeOperand_SReg_32(Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
+  // see decodeOperand_SReg_32 comment
+  return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
+  return decodeSrcOp(OPW128, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
+  return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
+  return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+  // For now all literal constants are supposed to be unsigned integers.
+  // ToDo: deal with signed/unsigned 64-bit integer constants
+  // ToDo: deal with float/double constants
+  if (Bytes.size() < 4)
+    return errOperand(0, "cannot read literal, inst bytes left " +
+                         Twine(Bytes.size()));
+  return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+}
+
+MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
+  using namespace AMDGPU::EncValues;
+  assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
+  return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
+    (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
+    (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
+  // Cast prevents negative overflow.
+}
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
+  assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
+         && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+  // ToDo: case 248: 1/(2*PI) - is allowed only on VI
+  // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It considers 1/(2*PI)
+  // as a literal constant.
+  float V = 0.0f;
+  switch (Imm) {
+  case 240: V = 0.5f; break;
+  case 241: V = -0.5f; break;
+  case 242: V = 1.0f; break;
+  case 243: V = -1.0f; break;
+  case 244: V = 2.0f; break;
+  case 245: V = -2.0f; break;
+  case 246: V = 4.0f; break;
+  case 247: V = -4.0f; break;
+  case 248: return MCOperand::createImm(Is32 ?
// 1/(2*PI) + 0x3e22f983 : + 0x3fc45f306dc9c882); + default: break; + } + return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V)); +} + +unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return VGPR_32RegClassID; + case OPW64: return VReg_64RegClassID; + case OPW128: return VReg_128RegClassID; + } +} + +unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return SGPR_32RegClassID; + case OPW64: return SGPR_64RegClassID; + case OPW128: return SGPR_128RegClassID; + } +} + +unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return TTMP_32RegClassID; + case OPW64: return TTMP_64RegClassID; + case OPW128: return TTMP_128RegClassID; + } +} + +MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const { + using namespace AMDGPU::EncValues; + assert(Val < 512); // enum9 + + if (VGPR_MIN <= Val && Val <= VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN); + } + if (Val <= SGPR_MAX) { + assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. + return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); + } + if (TTMP_MIN <= Val && Val <= TTMP_MAX) { + return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); + } + + assert(Width == OPW32 || Width == OPW64); + const bool Is32 = (Width == OPW32); + + if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) + return decodeIntImmed(Val); + + if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX) + return decodeFPImmed(Is32, Val); + + if (Val == LITERAL_CONST) + return decodeLiteralConstant(); + + return Is32 ? 
decodeSpecialReg32(Val) : decodeSpecialReg64(Val); +} + +MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { + using namespace AMDGPU; + switch (Val) { + case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI)); + case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI)); + // ToDo: no support for xnack_mask_lo/_hi register + case 104: + case 105: break; + case 106: return createRegOperand(VCC_LO); + case 107: return createRegOperand(VCC_HI); + case 108: return createRegOperand(TBA_LO); + case 109: return createRegOperand(TBA_HI); + case 110: return createRegOperand(TMA_LO); + case 111: return createRegOperand(TMA_HI); + case 124: return createRegOperand(M0); + case 126: return createRegOperand(EXEC_LO); + case 127: return createRegOperand(EXEC_HI); + // ToDo: no support for vccz register + case 251: break; + // ToDo: no support for execz register + case 252: break; + case 253: return createRegOperand(SCC); + default: break; + } + return errOperand(Val, "unknown operand encoding " + Twine(Val)); +} + +MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { + using namespace AMDGPU; + switch (Val) { + case 102: return createRegOperand(getMCReg(FLAT_SCR, STI)); + case 106: return createRegOperand(VCC); + case 108: return createRegOperand(TBA); + case 110: return createRegOperand(TMA); + case 126: return createRegOperand(EXEC); + default: break; + } + return errOperand(Val, "unknown operand encoding " + Twine(Val)); +} + +static MCDisassembler *createAMDGPUDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AMDGPUDisassembler(STI, Ctx); +} + +extern "C" void LLVMInitializeAMDGPUDisassembler() { + TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler); +} diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h new file mode 100644 index 000000000000..dff26a044bf5 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -0,0 +1,93 @@ +//===-- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
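Summarizing decodeSrcOp() from the .cpp above: a 9-bit source operand encoding selects a VGPR, an SGPR, a trap temporary, an inline constant, a trailing 32-bit literal, or a special register. A standalone sketch of the dispatch, with boundary values mirroring the AMDGPU::EncValues ranges used above (treat the exact numbers here as illustrative; the authoritative ones live in the headers):

enum class SrcKind { VGPR, SGPR, TTMP, InlineInt, InlineFP, Literal, Special };

static SrcKind classifySrcOp(unsigned Val) {          // Val < 512 (enum9)
  if (Val >= 256)                return SrcKind::VGPR;      // v0..v255
  if (Val <= 101)                return SrcKind::SGPR;      // s0..s101
  if (Val >= 112 && Val <= 123)  return SrcKind::TTMP;      // trap temporaries
  if (Val >= 128 && Val <= 208)  return SrcKind::InlineInt; // 0..64, -1..-16
  if (Val >= 240 && Val <= 248)  return SrcKind::InlineFP;  // +-0.5..+-4.0, 1/(2*PI)
  if (Val == 255)                return SrcKind::Literal;   // 32-bit literal follows
  return SrcKind::Special;       // vcc, exec, m0, flat_scratch, scc, ...
}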
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file contains declaration for AMDGPU ISA disassembler +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H +#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" + +namespace llvm { + + class MCContext; + class MCInst; + class MCOperand; + class MCSubtargetInfo; + class Twine; + + class AMDGPUDisassembler : public MCDisassembler { + private: + mutable ArrayRef Bytes; + + public: + AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) {} + + ~AMDGPUDisassembler() {} + + DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &WS, raw_ostream &CS) const override; + + const char* getRegClassName(unsigned RegClassID) const; + + MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; + MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; + + MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const; + + DecodeStatus tryDecodeInst(const uint8_t* Table, + MCInst &MI, + uint64_t Inst, + uint64_t Address) const; + + MCOperand decodeOperand_VGPR_32(unsigned Val) const; + MCOperand decodeOperand_VS_32(unsigned Val) const; + MCOperand decodeOperand_VS_64(unsigned Val) const; + + MCOperand decodeOperand_VReg_64(unsigned Val) const; + MCOperand decodeOperand_VReg_96(unsigned Val) const; + MCOperand decodeOperand_VReg_128(unsigned Val) const; + + MCOperand decodeOperand_SReg_32(unsigned Val) const; + MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const; + MCOperand decodeOperand_SReg_64(unsigned Val) const; + MCOperand decodeOperand_SReg_128(unsigned Val) const; + MCOperand decodeOperand_SReg_256(unsigned Val) const; + MCOperand decodeOperand_SReg_512(unsigned Val) const; + + enum OpWidthTy { + OPW32, + OPW64, + OPW128, + OPW_LAST_, + OPW_FIRST_ = OPW32 + }; + unsigned getVgprClassId(const OpWidthTy Width) const; + unsigned getSgprClassId(const OpWidthTy Width) const; + unsigned getTtmpClassId(const OpWidthTy Width) const; + + static MCOperand decodeIntImmed(unsigned Imm); + static MCOperand decodeFPImmed(bool Is32, unsigned Imm); + MCOperand decodeLiteralConstant() const; + + MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSpecialReg32(unsigned Val) const; + MCOperand decodeSpecialReg64(unsigned Val) const; + }; +} // namespace llvm + +#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H diff --git a/lib/Target/AMDGPU/Disassembler/CMakeLists.txt b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt new file mode 100644 index 000000000000..fb9231576919 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) + +add_llvm_library(LLVMAMDGPUDisassembler + AMDGPUDisassembler.cpp + ) + +add_dependencies(LLVMAMDGPUDisassembler AMDGPUCommonTableGen LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt new file mode 100644 index 000000000000..c9005f8a7884 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUDisassembler +parent = AMDGPU +required_libraries = AMDGPUDesc AMDGPUInfo AMDGPUUtils MC MCDisassembler Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 2245f1417e53..94f05cc41aff 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -85,8 +85,6 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common ; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; -defm : Expand24IBitOps; - //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -212,23 +210,23 @@ class VTX_READ_128_eg buffer_id, list pattern> // VTX Read from parameter memory space //===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3, [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3, [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3, [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3, [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3, [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; @@ -237,27 +235,53 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, //===----------------------------------------------------------------------===// // 8-bit reads -def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] +>; + +// 16-bit reads +def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_ID1_64_eg : VTX_READ_64_eg <1, + [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] 
+>; + +// 128-bit reads +def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 8-bit reads +def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +// 16-bit reads +def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] >; // 32-bit reads -def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2, + [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; // 64-bit reads -def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2, + [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; // 128-bit reads -def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2, + [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; } // End Predicates = [isEG] @@ -356,8 +380,6 @@ let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } -def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; - def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { let Pattern = []; let Itinerary = AnyALU; @@ -372,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -401,11 +423,6 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp new file mode 100644 index 000000000000..29b1f79187d5 --- /dev/null +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -0,0 +1,264 @@ +//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on GCN processors. 
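The recognizer below works on a simple model: keep a short history of issued instructions, one slot per wait state (nullptr marks an idle state), and answer "how many wait states ago was this register written?". A minimal standalone model of that bookkeeping, with simplified stand-in types (not the LLVM classes themselves):

#include <list>

struct Instr { unsigned DefReg; };          // stand-in for MachineInstr

struct WaitStateHistory {
  std::list<const Instr *> Emitted;         // front = most recent wait state
  static constexpr unsigned MaxLookAhead = 5;

  void advance(const Instr *I) {            // like AdvanceCycle()
    Emitted.push_front(I);
    Emitted.resize(MaxLookAhead);           // older history can never matter
  }

  // Mirrors getWaitStatesSinceDef(): distance to the most recent def of Reg,
  // or a huge value meaning "no hazard in the visible window".
  int sinceDef(unsigned Reg) const {
    int WaitStates = -1;
    for (const Instr *I : Emitted) {
      ++WaitStates;
      if (I && I->DefReg == Reg)
        return WaitStates;
    }
    return 1 << 30;
  }
};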
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  CurrCycleInstr(nullptr),
+  MF(MF),
+  ST(MF.getSubtarget<SISubtarget>()) {
+  MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+  EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+  CurrCycleInstr = MI;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+    return NoopHazard;
+
+  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+    return NoopHazard;
+
+  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
+    return NoopHazard;
+
+  return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  if (SIInstrInfo::isSMRD(*MI))
+    return std::max(0, checkSMRDHazards(MI));
+
+  if (SIInstrInfo::isVMEM(*MI))
+    return std::max(0, checkVMEMHazards(MI));
+
+  if (SIInstrInfo::isDPP(*MI))
+    return std::max(0, checkDPPHazards(MI));
+
+  return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+  EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+
+  // When the scheduler detects a stall, it will call AdvanceCycle() without
+  // emitting any instructions.
+  if (!CurrCycleInstr)
+    return;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+  // Keep track of emitted instructions
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // Add a nullptr for each additional wait state after the first. Make sure
+  // not to add more than getMaxLookAhead() items to the list, since we
+  // truncate the list to that size right after this loop.
+  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+       i < e; ++i) {
+    EmittedInstrs.push_front(nullptr);
+  }
+
+  // getMaxLookAhead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in keeping track of more than that many
+  // wait states.
+  EmittedInstrs.resize(getMaxLookAhead());
+
+  CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  int WaitStates = -1;
+  for (MachineInstr *MI : EmittedInstrs) {
+    ++WaitStates;
+    if (!MI || !IsHazardDef(MI))
+      continue;
+    if (MI->modifiesRegister(Reg, TRI))
+      return WaitStates;
+  }
+  return std::numeric_limits<int>::max();
+}
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+                         std::set<unsigned> &Set) {
+  for (const MachineOperand &Op : Ops) {
+    if (Op.isReg())
+      Set.insert(Op.getReg());
+  }
+}
+
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+  // SMEM soft clauses are only present on VI+.
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  // A soft clause is any group of consecutive SMEM instructions. The
+  // instructions in this group may return out of order and/or may be
+  // replayed (i.e. the same instruction issued more than once).
+  //
+  // In order to handle these situations correctly we need to make sure that
+  // when a clause has more than one instruction, no instruction in the
+  // clause writes to a register that is read by another instruction in the
+  // clause (including itself). If we encounter this situation, we need to
+  // break the clause by inserting a non-SMEM instruction.
+
+  std::set<unsigned> ClauseDefs;
+  std::set<unsigned> ClauseUses;
+
+  for (MachineInstr *MI : EmittedInstrs) {
+
+    // When we hit a non-SMEM instruction then we have passed the start of the
+    // clause and we can stop.
+    if (!MI || !SIInstrInfo::isSMRD(*MI))
+      break;
+
+    addRegsToSet(MI->defs(), ClauseDefs);
+    addRegsToSet(MI->uses(), ClauseUses);
+  }
+
+  if (ClauseDefs.empty())
+    return 0;
+
+  // FIXME: When we support stores, we need to make sure not to put loads and
+  // stores in the same clause if they use the same address. For now, just
+  // start a new clause whenever we see a store.
+  if (SMEM->mayStore())
+    return 1;
+
+  addRegsToSet(SMEM->defs(), ClauseDefs);
+  addRegsToSet(SMEM->uses(), ClauseUses);
+
+  std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
+  std::vector<unsigned>::iterator End;
+
+  End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
+                              ClauseUses.begin(), ClauseUses.end(), Result.begin());
+
+  // If the set of defs and uses intersect then we cannot add this instruction
+  // to the clause, so we have a hazard.
+  if (End != Result.begin())
+    return 1;
+
+  return 0;
+}
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  int WaitStatesNeeded = 0;
+
+  WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+
+  // This SMRD hazard only affects SI.
+  if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+    return WaitStatesNeeded;
+
+  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
+  // SGPR was written by a VALU instruction.
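As a worked example of the computation that follows: if the SGPR was written 1 wait state ago, the SMRD read needs max(0, 4 - 1) = 3 more wait states, and the overall requirement is the max over all register uses. In sketch form (illustrative helper, not LLVM code):

#include <algorithm>

static int smrdNopsNeeded(int WaitStatesSinceDef) {
  const int SmrdSgprWaitStates = 4;  // SI: VALU write -> SMRD read latency
  return std::max(0, SmrdSgprWaitStates - WaitStatesSinceDef);
}
// smrdNopsNeeded(1) == 3, smrdNopsNeeded(7) == 0.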
+  int SmrdSgprWaitStates = 4;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : SMRD->uses()) {
+    if (!Use.isReg())
+      continue;
+    int WaitStatesNeededForUse =
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+  // SGPR was written by a VALU instruction.
+  int VmemSgprWaitStates = 5;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : VMEM->uses()) {
+    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+
+    int WaitStatesNeededForUse =
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // Check for DPP VGPR read after VALU VGPR write.
+  int DppVgprWaitStates = 2;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Use : DPP->uses()) {
+    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+    int WaitStatesNeededForUse =
+        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 000000000000..d82041c5f174
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,62 @@
+//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+class SISubtarget;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+  // This variable stores the instruction that has been emitted this cycle. It
+  // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
+  // called.
+  MachineInstr *CurrCycleInstr;
+  std::list<MachineInstr *> EmittedInstrs;
+  const MachineFunction &MF;
+  const SISubtarget &ST;
+
+  int getWaitStatesSinceDef(unsigned Reg,
+                            function_ref<bool(MachineInstr *)> IsHazardDef =
+                                [](MachineInstr *) { return true; });
+
+  int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+  int checkSMRDHazards(MachineInstr *SMRD);
+  int checkVMEMHazards(MachineInstr* VMEM);
+  int checkDPPHazards(MachineInstr *DPP);
+public:
+  GCNHazardRecognizer(const MachineFunction &MF);
+  // We can only issue one instruction per cycle.
+  bool atIssueLimit() const override { return true; }
+  void EmitInstruction(SUnit *SU) override;
+  void EmitInstruction(MachineInstr *MI) override;
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitNoop() override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  unsigned PreEmitNoops(MachineInstr *) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a187de88f639..2932d3bb1580 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
+#include "Utils/AMDGPUAsmUtils.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -18,6 +19,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include
+
 using namespace llvm;
 
 void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
@@ -28,6 +31,11 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   printAnnotation(OS, Annot);
 }
 
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
 void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
                                           raw_ostream &O) {
   O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
@@ -43,6 +51,11 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
   O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
 }
 
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
 void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
                                              raw_ostream &O) {
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
@@ -53,22 +66,26 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
 }
 
+void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo,
+                                      raw_ostream& O, StringRef BitName) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << ' ' << BitName;
+  }
+}
+
 void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
                                    raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm())
-    O << " offen";
+  printNamedBit(MI, OpNo, O, "offen");
 }
 
 void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
                                    raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm())
-    O << " idxen";
+  printNamedBit(MI, OpNo, O, "idxen");
 }
 
 void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm())
-    O << " addr64";
+  printNamedBit(MI, OpNo, O, "addr64");
 }
 
 void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned
OpNo, @@ -79,7 +96,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { @@ -88,7 +105,7 @@ void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset0:"; @@ -96,7 +113,7 @@ void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset1:"; @@ -104,28 +121,62 @@ void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " gds"; + printNamedBit(MI, OpNo, O, "gds"); } void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " glc"; + printNamedBit(MI, OpNo, O, "glc"); } void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " slc"; + printNamedBit(MI, OpNo, O, "slc"); } void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " tfe"; + printNamedBit(MI, OpNo, O, "tfe"); +} + +void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dmask:"; + printU16ImmOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "unorm"); +} + +void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "da"); +} + +void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "r128"); +} + +void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "lwe"); } void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, @@ -152,6 +203,18 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, case AMDGPU::VCC_HI: O << "vcc_hi"; return; + case AMDGPU::TBA_LO: + O << "tba_lo"; + return; + case AMDGPU::TBA_HI: + O << "tba_hi"; + return; + case AMDGPU::TMA_LO: + O << "tma_lo"; + return; + case AMDGPU::TMA_HI: + O << "tma_hi"; + return; case AMDGPU::EXEC_LO: O << "exec_lo"; return; @@ -168,62 +231,73 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, break; } - char Type; - unsigned NumRegs; + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
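+  // For example (added commentary, not part of the original patch): a
+  // VReg_64 register whose encoding value is 0x105 masks down to RegIdx = 5
+  // below, and the range form at the end of this function prints "v[5:6]".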
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + unsigned NumRegs; if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; + O <<'v'; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; + } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) { + O << 's'; NumRegs = 2; } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; + } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) { + O << 's'; NumRegs = 4; } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 3; } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 16; } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) { + O << "ttmp"; + NumRegs = 2; + RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. + } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) { + O << "ttmp"; + NumRegs = 4; + RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. } else { O << getRegisterName(reg); return; } - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); if (NumRegs == 1) { - O << Type << RegIdx; + O << RegIdx; return; } - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; + O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) O << "_e64 "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) + O << "_dpp "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) + O << "_sdwa "; else O << "_e32 "; @@ -345,12 +419,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCExpr *Exp = Op.getExpr(); Exp->print(O, &MAI); } else { - llvm_unreachable("unknown operand type in printOperand"); + O << "/*INV_OP*/"; } } -void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::NEG) O << '-'; @@ -361,6 +436,122 @@ void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, O << '|'; } +void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::SEXT) + O << "sext("; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::SEXT) + O << ')'; +} + + +void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm <= 0x0ff) { + O << " quad_perm:["; + O << formatDec(Imm & 0x3) << ','; + O << formatDec((Imm & 0xc) >> 2) << ','; + O << formatDec((Imm & 0x30) >> 4) << ','; + O << formatDec((Imm & 0xc0) >> 6) << ']'; + } else if ((Imm >= 0x101) && (Imm <= 0x10f)) { + O << " row_shl:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= 0x111) && (Imm <= 0x11f)) { + O << " row_shr:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= 0x121) && (Imm <= 0x12f)) { + O << " row_ror:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if (Imm == 0x130) { + O << " wave_shl:1"; + } else if (Imm == 0x134) { + O << " wave_rol:1"; + } else if (Imm == 0x138) { + O << " wave_shr:1"; + } else if (Imm == 0x13c) { + O << " wave_ror:1"; + } else if (Imm == 0x140) { + O << " row_mirror"; + } else if (Imm == 0x141) { + O << " row_half_mirror"; + } else if (Imm == 0x142) { + O << " row_bcast:15"; + } else if (Imm == 0x143) { + O << " row_bcast:31"; + } else { + llvm_unreachable("Invalid dpp_ctrl value"); + } +} + +void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " row_mask:"; + printU4ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " bank_mask:"; + printU4ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm) { + O << " bound_ctrl:0"; // XXX - this syntax is used in sp3 + } +} + +void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + switch (Imm) { + case 0: O << "BYTE_0"; break; + case 1: O << "BYTE_1"; break; + case 2: O << "BYTE_2"; break; + case 3: O << "BYTE_3"; break; + case 
4: O << "WORD_0"; break; + case 5: O << "WORD_1"; break; + case 6: O << "DWORD"; break; + default: llvm_unreachable("Invalid SDWA data select operand"); + } +} + +void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "dst_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "src0_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "src1_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "dst_unused:"; + unsigned Imm = MI->getOperand(OpNo).getImm(); + switch (Imm) { + case 0: O << "UNUSED_PAD"; break; + case 1: O << "UNUSED_SEXT"; break; + case 2: O << "UNUSED_PRESERVE"; break; + default: llvm_unreachable("Invalid SDWA dest_unused operand"); + } +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); @@ -395,9 +586,17 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, char Asm) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) + O << Asm; +} + void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "|"); + printIfSet(MI, OpNo, O, '|'); } void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, @@ -424,8 +623,15 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - int32_t Imm = MI->getOperand(OpNo).getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() || Op.isExpr()); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; + } + if (Op.isExpr()) { + Op.getExpr()->print(O << '@', &MAI); + } } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, @@ -435,7 +641,7 @@ void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "-"); + printIfSet(MI, OpNo, O, '-'); } void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, @@ -456,7 +662,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "+"); + printIfSet(MI, OpNo, O, '+'); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, @@ -585,43 +791,49 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Msg = SImm16 & 0xF; - if (Msg == 2 || Msg == 3) { - unsigned Op = (SImm16 >> 4) & 0xF; - if (Msg == 3) - O << "Gs_done("; - else - O << "Gs("; - if (Op == 0) { - O << "nop"; - } else { - unsigned Stream = (SImm16 >> 8) & 0x3; - if (Op == 1) - O << "cut"; - else if (Op == 2) - O << "emit"; - else if (Op == 3) - O << "emit-cut"; - O << " stream " << Stream; + using namespace llvm::AMDGPU::SendMsg; + + const unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const 
unsigned Id = SImm16 & ID_MASK_; + do { + if (Id == ID_INTERRUPT) { + if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0. + break; + O << "sendmsg(" << IdSymbolic[Id] << ')'; + return; } - O << "), [m0] "; - } else if (Msg == 1) - O << "interrupt "; - else if (Msg == 15) - O << "system "; - else - O << "unknown(" << Msg << ") "; + if (Id == ID_GS || Id == ID_GS_DONE) { + if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0. + break; + const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_; + const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; + if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only. + break; + if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits. + break; + O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs]; + if (OpGs != OP_GS_NOP) { O << ", " << StreamId; } + O << ')'; + return; + } + if (Id == ID_SYSMSG) { + if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0. + break; + const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_; + if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown. + break; + O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')'; + return; + } + } while (0); + O << SImm16; // Unknown simm16 code. } void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs - // SIInsertWaits.cpp bits usage does not match ISA docs description but it - // works so it might be a misprint in docs. unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0x7; unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; bool NeedSpace = false; @@ -638,11 +850,32 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, NeedSpace = true; } - if (Lgkmcnt != 0x7) { + if (Lgkmcnt != 0xF) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; } } +void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + using namespace llvm::AMDGPU::Hwreg; + + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_; + const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_; + const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; + + O << "hwreg("; + if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) { + O << IdSymbolic[Id]; + } else { + O << Id; + } + if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) { + O << ", " << Offset << ", " << Width; + } + O << ')'; +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 90541d86132d..f5a290f16045 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" @@ -33,37 +33,60 @@ public: const MCRegisterInfo &MRI); private: + void printU4ImmOperand(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printNamedBit(const MCInst* MI, unsigned OpNo, raw_ostream& O, + StringRef BitName); void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printImmediate32(uint32_t I, raw_ostream &O); void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); + static void printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, char Asm); 
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -79,6 +102,7 @@ private: static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt index ce63bd553b9c..7191ff2c4577 100644 --- a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt +++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUAsmPrinter AMDGPUInstPrinter.cpp ) + +add_dependencies(LLVMAMDGPUAsmPrinter LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt index fdb43844dc63..30c2670316c8 100644 --- a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt +++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt @@ -19,6 +19,6 @@ type = Library name = AMDGPUAsmPrinter parent = AMDGPU -required_libraries = MC Support +required_libraries = MC Support AMDGPUUtils add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile deleted file mode 100644 index 4e48ac7e28a9..000000000000 --- a/lib/Target/AMDGPU/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUAsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt index 38c5489586f1..bbdd17737cf0 100644 --- a/lib/Target/AMDGPU/LLVMBuild.txt +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; +;===- ./lib/Target/AMDGPU/LLVMBuild.txt ------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo Utils +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils [component_0] type = TargetGroup @@ -24,10 +24,11 @@ name = AMDGPU parent = Target has_asmparser = 1 has_asmprinter = 1 +has_disassembler = 1 [component_1] type = Library name = AMDGPUCodeGen parent = AMDGPU -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 60e8c8f3d303..1cb9d21408c6 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -53,7 +53,8 @@ public: const MCAsmLayout &Layout) const override { return false; } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override { assert(!"Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -73,12 +74,17 @@ void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, static unsigned getFixupKindNumBytes(unsigned Kind) { switch (Kind) { + case FK_SecRel_1: case FK_Data_1: return 1; + case FK_SecRel_2: case FK_Data_2: return 2; + case FK_SecRel_4: case FK_Data_4: + case FK_PCRel_4: return 4; + case FK_SecRel_8: case FK_Data_8: return 8; default: @@ -92,32 +98,15 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, switch ((unsigned)Fixup.getKind()) { case AMDGPU::fixup_si_sopp_br: { + int64_t BrImm = ((int64_t)Value - 4) / 4; + if (!isInt<16>(BrImm)) + report_fatal_error("branch size exceeds simm16"); + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = (Value - 4) / 4; + *Dst = BrImm; break; } - case AMDGPU::fixup_si_rodata: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // We emit constant data at the end of the text section and generate its - // address using the following code sequence: - // s_getpc_b64 s[0:1] - // s_add_u32 s0, s0, $symbol - // s_addc_u32 s1, s1, 0 - // - // s_getpc_b64 returns the address of the s_add_u32 instruction and then - // the fixup replaces $symbol with a literal constant, which is a - // pc-relative offset from the encoding of the $symbol operand to the - // constant data. - // - // What we want here is an offset from the start of the s_add_u32 - // instruction to the constant data, but since the encoding of $symbol - // starts 4 bytes after the start of the add instruction, we end up - // with an offset that is 4 bytes too small. 
This requires us to - // add 4 to the fixup value before applying it. - *Dst = Value + 4; - break; - } default: { // FIXME: Copied from AArch64 unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); @@ -144,7 +133,6 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) @@ -167,13 +155,15 @@ namespace { class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; + bool HasRelocationAddend; public: - ELFAMDGPUAsmBackend(const Target &T, bool Is64Bit) : - AMDGPUAsmBackend(T), Is64Bit(Is64Bit) { } + ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) : + AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), + HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { } MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(Is64Bit, OS); + return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS); } }; @@ -182,8 +172,6 @@ public: MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU) { - Triple TargetTriple(TT); - // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, TargetTriple.getArch() == Triple::amdgcn); + return new ELFAMDGPUAsmBackend(T, TT); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 820f17df8960..b4e3b8e896bd 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -18,23 +18,56 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - AMDGPUELFObjectWriter(bool Is64Bit); + AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend); protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override { - return Fixup.getKind(); - } - + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; }; } // End anonymous namespace -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit) - : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA, - ELF::EM_AMDGPU, false) { } +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(Is64Bit, + ELF::ELFOSABI_AMDGPU_HSA, + ELF::EM_AMDGPU, + HasRelocationAddend) { } + +unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + // SCRATCH_RSRC_DWORD[01] is a special global variable that represents + // the scratch buffer. 
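+  // Illustrative case (added commentary, not part of the original patch): a
+  // 4-byte pc-relative fixup against a global symbol, as emitted by
+  // SIMCCodeEmitter::getMachineOpValue() for the s_getpc_b64/s_add_u32
+  // addressing sequence, arrives here as FK_PCRel_4 and maps to
+  // ELF::R_AMDGPU_REL32 further below.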
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") + return ELF::R_AMDGPU_ABS32_LO; + if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") + return ELF::R_AMDGPU_ABS32_HI; + + switch (Target.getAccessVariant()) { + default: + break; + case MCSymbolRefExpr::VK_GOTPCREL: + return ELF::R_AMDGPU_GOTPCREL; + } + + switch (Fixup.getKind()) { + default: break; + case FK_PCRel_4: + return ELF::R_AMDGPU_REL32; + case FK_SecRel_4: + return ELF::R_AMDGPU_ABS32; + } + + llvm_unreachable("unhandled relocation type"); +} + -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(Is64Bit); +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend, + raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = + new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend); return createELFObjectWriter(MOTW, OS, true); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 9ff9fe794d2b..43338a5bebd2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -12,11 +12,6 @@ using namespace llvm; -void AMDGPUELFStreamer::InitSections(bool NoExecStack) { - // Start with the .hsatext section by default. - SwitchSection(AMDGPU::getHSATextSection(getContext())); -} - MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 488d7e74d741..5319b65d65f9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -1,4 +1,4 @@ -//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -29,7 +29,6 @@ public: MCCodeEmitter *Emitter) : MCELFStreamer(Context, MAB, OS, Emitter) { } - virtual void InitSections(bool NoExecStac) override; }; MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 59a9178082f6..20c1adfbc6b9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H #include "llvm/MC/MCFixup.h" @@ -18,9 +18,6 @@ enum Fixups { /// 16-bit PC relative fixup for SOPP branch instructions. 
fixup_si_sopp_br = FirstTargetFixupKind, - /// fixup for global addresses with constant initializers - fixup_si_rodata, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 4bc80a028936..1655591abf39 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -9,12 +9,15 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" +#include "llvm/ADT/Triple.h" using namespace llvm; + AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// - MaxInstLength = 16; + MinInstAlignment = 4; + MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16; SeparatorString = "\n"; CommentString = ";"; PrivateLabelPrefix = ""; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a546961705d7..8cb33a3179cd 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index c95742762233..c942ea904085 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #include "llvm/MC/MCCodeEmitter.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index f70409470276..a0d9aab114fc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -18,7 +18,6 @@ #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -56,15 +55,6 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; -} - static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -99,7 +89,6 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, 
&TheGCNTarget}) { RegisterMCAsmInfo X(*T); - TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 5d1b86b8c0c2..9ab7940812ba 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -13,13 +13,13 @@ //===----------------------------------------------------------------------===// // -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include "llvm/ADT/StringRef.h" namespace llvm { +class StringRef; class MCAsmBackend; class MCCodeEmitter; class MCContext; @@ -47,6 +47,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU); MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend, raw_pwrite_stream &OS); } // End llvm namespace diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index b91134d2ee9b..83dcaacb738f 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -312,10 +312,6 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - // The MCObjectFileInfo that is available to the assembler is a generic - // implementation and not AMDGPUHSATargetObjectFile, so we can't use - // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. - OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 83bb728f541c..b3d59e8f396e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,16 +7,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" + namespace llvm { class MCELFStreamer; +class MCSymbol; class AMDGPUTargetStreamer : public MCTargetStreamer { public: diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile deleted file mode 100644 index 5ad68662d98c..000000000000 --- a/lib/Target/AMDGPU/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 3c1142dd664b..5e8e6ceb7ca2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
 #include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -51,12 +52,9 @@ public:
                          const MCSubtargetInfo &STI) const override;
 
 private:
-  void EmitByte(unsigned int byte, raw_ostream &OS) const;
-
   void Emit(uint32_t value, raw_ostream &OS) const;
   void Emit(uint64_t value, raw_ostream &OS) const;
 
-  unsigned getHWRegChan(unsigned reg) const;
   unsigned getHWReg(unsigned regNo) const;
 };
@@ -142,10 +140,6 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
   }
 }
 
-void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
-  OS.write((uint8_t) Byte & 0xff);
-}
-
 void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
   support::endian::Writer<support::little>(OS).write(Value);
 }
@@ -154,17 +148,13 @@ void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
   support::endian::Writer<support::little>(OS).write(Value);
 }
 
-unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
-  return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
-}
-
 unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
   return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
 }
 
 uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                               const MCOperand &MO,
-                                              SmallVectorImpl<MCFixup> &Fixup,
+                                              SmallVectorImpl<MCFixup> &Fixups,
                                               const MCSubtargetInfo &STI) const {
   if (MO.isReg()) {
     if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
@@ -172,6 +162,18 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
     return getHWReg(MO.getReg());
   }
 
+  if (MO.isExpr()) {
+    // We put rodata at the end of the code section, then map the entire
+    // code section as vtx buf. Thus the section relative address is the
+    // correct one.
+    // Each R600 literal instruction has two operands. We can't easily get
+    // the order of the current one, so compare against the first one and
+    // adjust the offset.
+    const unsigned offset = (&MO == &MI.getOperand(0)) ?
0 : 4;
+    Fixups.push_back(MCFixup::create(offset, MO.getExpr(), FK_SecRel_4, MI.getLoc()));
+    return 0;
+  }
+
   assert(MO.isImm());
   return MO.getImm();
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 9eb3dadbc5e2..71b585c25ac5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -162,20 +162,30 @@ static uint32_t getLit64Encoding(uint64_t Val) {
 
 uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
                                          unsigned OpSize) const {
-  if (MO.isExpr())
-    return 255;
-
-  assert(!MO.isFPImm());
+  int64_t Imm;
+  if (MO.isExpr()) {
+    const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+    if (!C)
+      return 255;
+
+    Imm = C->getValue();
+  } else {
 
-  if (!MO.isImm())
-    return ~0;
+    assert(!MO.isFPImm());
+
+    if (!MO.isImm())
+      return ~0;
+
+    Imm = MO.getImm();
+  }
 
   if (OpSize == 4)
-    return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
+    return getLit32Encoding(static_cast<uint32_t>(Imm));
 
   assert(OpSize == 8);
 
-  return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
+  return getLit64Encoding(static_cast<uint64_t>(Imm));
 }
 
 void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -213,7 +223,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
 
     if (Op.isImm())
       Imm = Op.getImm();
-    else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+    else if (Op.isExpr()) {
+      if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+        Imm = C->getValue();
+
+    } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
       llvm_unreachable("Must be immediate or expr");
 
     for (unsigned j = 0; j < 4; j++) {
@@ -247,10 +261,14 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
   if (MO.isReg())
     return MRI.getEncodingValue(MO.getReg());
 
-  if (MO.isExpr()) {
-    const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
-    MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
-    Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
+  if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
+    const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+    MCFixupKind Kind;
+    if (Expr && Expr->getSymbol().isExternal())
+      Kind = FK_Data_4;
+    else
+      Kind = FK_PCRel_4;
+    Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
   }
 
   // Figure out the operand number, needed for isSrcOperand check
diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile
deleted file mode 100644
index 219f34daa24f..000000000000
--- a/lib/Target/AMDGPU/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAMDGPUCodeGen
-TARGET = AMDGPU
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ - AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ - AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ - AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc - -DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index 4300d972d46b..f5f1eb14e993 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -13,11 +13,8 @@ class Proc Featur //===----------------------------------------------------------------------===// // R600 //===----------------------------------------------------------------------===// -def : Proc<"", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache]>; - def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>; def : Proc<"r630", R600_VLIW5_Itin, [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; @@ -84,11 +81,11 @@ def : Proc<"cayman", R600_VLIW4_Itin, //===----------------------------------------------------------------------===// def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] + [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] >; -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] >; def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; @@ -116,8 +113,8 @@ def : ProcessorModel<"kaveri", SIQuarterSpeedModel, >; def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32, - FeatureISAVersion7_0_1] + [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops, + FeatureLDSBankCount32, FeatureISAVersion7_0_1] >; def : ProcessorModel<"mullins", SIQuarterSpeedModel, @@ -148,3 +145,11 @@ def : ProcessorModel<"fiji", SIQuarterSpeedModel, def : ProcessorModel<"stoney", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] >; + +def : ProcessorModel<"polaris10", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"polaris11", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] +>; diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 3cb90218a7d5..3ccde79e2df4 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -31,8 +31,8 @@ using namespace llvm; namespace { -static bool isCFAlu(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isCFAlu(const MachineInstr &MI) { + switch (MI.getOpcode()) { case AMDGPU::CF_ALU: case AMDGPU::CF_ALU_PUSH_BEFORE: return true; @@ -47,19 +47,19 @@ private: static char ID; const R600InstrInfo *TII; - unsigned getCFAluSize(const MachineInstr *MI) const; - bool isCFAluEnabled(const MachineInstr *MI) const; + unsigned getCFAluSize(const MachineInstr &MI) const; + bool isCFAluEnabled(const MachineInstr &MI) const; /// IfCvt pass can generate "disabled" ALU clause marker that need to be /// removed and their content affected to the previous alu clause. 
/// This function parse instructions after CFAlu until it find a disabled /// CFAlu and merge the content, or an enabled CFAlu. - void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const; /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if /// it is the case. - bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) - const; + bool mergeIfPossible(MachineInstr &RootCFAlu, + const MachineInstr &LatrCFAlu) const; public: R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } @@ -71,38 +71,40 @@ public: char R600ClauseMergePass::ID = 0; -unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); + return MI + .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT)) + .getImm(); } -bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const { assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); + return MI + .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled)) + .getImm(); } -void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) - const { +void R600ClauseMergePass::cleanPotentialDisabledCFAlu( + MachineInstr &CFAlu) const { int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end(); I++; do { - while (I!= E && !isCFAlu(I)) + while (I != E && !isCFAlu(*I)) I++; if (I == E) return; - MachineInstr *MI = I++; + MachineInstr &MI = *I++; if (isCFAluEnabled(MI)) break; - CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); - MI->eraseFromParent(); + CFAlu.getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI.eraseFromParent(); } while (I != E); } -bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, - const MachineInstr *LatrCFAlu) const { +bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, + const MachineInstr &LatrCFAlu) const { assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); unsigned RootInstCount = getCFAluSize(RootCFAlu), @@ -112,7 +114,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, DEBUG(dbgs() << "Excess inst counts\n"); return false; } - if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) return false; // Is KCache Bank 0 compatible ? 
int Mode0Idx = @@ -121,12 +123,12 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); int KBank0LineIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); - if (LatrCFAlu->getOperand(Mode0Idx).getImm() && - RootCFAlu->getOperand(Mode0Idx).getImm() && - (LatrCFAlu->getOperand(KBank0Idx).getImm() != - RootCFAlu->getOperand(KBank0Idx).getImm() || - LatrCFAlu->getOperand(KBank0LineIdx).getImm() != - RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + if (LatrCFAlu.getOperand(Mode0Idx).getImm() && + RootCFAlu.getOperand(Mode0Idx).getImm() && + (LatrCFAlu.getOperand(KBank0Idx).getImm() != + RootCFAlu.getOperand(KBank0Idx).getImm() || + LatrCFAlu.getOperand(KBank0LineIdx).getImm() != + RootCFAlu.getOperand(KBank0LineIdx).getImm())) { DEBUG(dbgs() << "Wrong KC0\n"); return false; } @@ -137,56 +139,61 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); int KBank1LineIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); - if (LatrCFAlu->getOperand(Mode1Idx).getImm() && - RootCFAlu->getOperand(Mode1Idx).getImm() && - (LatrCFAlu->getOperand(KBank1Idx).getImm() != - RootCFAlu->getOperand(KBank1Idx).getImm() || - LatrCFAlu->getOperand(KBank1LineIdx).getImm() != - RootCFAlu->getOperand(KBank1LineIdx).getImm())) { + if (LatrCFAlu.getOperand(Mode1Idx).getImm() && + RootCFAlu.getOperand(Mode1Idx).getImm() && + (LatrCFAlu.getOperand(KBank1Idx).getImm() != + RootCFAlu.getOperand(KBank1Idx).getImm() || + LatrCFAlu.getOperand(KBank1LineIdx).getImm() != + RootCFAlu.getOperand(KBank1LineIdx).getImm())) { DEBUG(dbgs() << "Wrong KC0\n"); return false; } - if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { - RootCFAlu->getOperand(Mode0Idx).setImm( - LatrCFAlu->getOperand(Mode0Idx).getImm()); - RootCFAlu->getOperand(KBank0Idx).setImm( - LatrCFAlu->getOperand(KBank0Idx).getImm()); - RootCFAlu->getOperand(KBank0LineIdx).setImm( - LatrCFAlu->getOperand(KBank0LineIdx).getImm()); + if (LatrCFAlu.getOperand(Mode0Idx).getImm()) { + RootCFAlu.getOperand(Mode0Idx).setImm( + LatrCFAlu.getOperand(Mode0Idx).getImm()); + RootCFAlu.getOperand(KBank0Idx).setImm( + LatrCFAlu.getOperand(KBank0Idx).getImm()); + RootCFAlu.getOperand(KBank0LineIdx) + .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm()); } - if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { - RootCFAlu->getOperand(Mode1Idx).setImm( - LatrCFAlu->getOperand(Mode1Idx).getImm()); - RootCFAlu->getOperand(KBank1Idx).setImm( - LatrCFAlu->getOperand(KBank1Idx).getImm()); - RootCFAlu->getOperand(KBank1LineIdx).setImm( - LatrCFAlu->getOperand(KBank1LineIdx).getImm()); + if (LatrCFAlu.getOperand(Mode1Idx).getImm()) { + RootCFAlu.getOperand(Mode1Idx).setImm( + LatrCFAlu.getOperand(Mode1Idx).getImm()); + RootCFAlu.getOperand(KBank1Idx).setImm( + LatrCFAlu.getOperand(KBank1Idx).getImm()); + RootCFAlu.getOperand(KBank1LineIdx) + .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm()); } - RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); - RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); + RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts); + RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode())); return true; } bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); + if (skipFunction(*MF.getFunction())) + return false; + + const R600Subtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + for 
(MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator LatestCFAlu = E; while (I != E) { - MachineInstr *MI = I++; + MachineInstr &MI = *I++; if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || - TII->mustBeLastInClause(MI->getOpcode())) + TII->mustBeLastInClause(MI.getOpcode())) LatestCFAlu = E; if (!isCFAlu(MI)) continue; cleanPotentialDisabledCFAlu(MI); - if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { - MI->eraseFromParent(); + if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) { + MI.eraseFromParent(); } else { - assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); + assert(MI.getOperand(8).getImm() && "CF ALU instruction disabled"); LatestCFAlu = MI; } } diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index bd80bb211b4f..d5bda4a8303e 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -39,16 +39,16 @@ struct CFStack { FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 }; - const AMDGPUSubtarget *ST; + const R600Subtarget *ST; std::vector BranchStack; std::vector LoopStack; unsigned MaxStackSize; unsigned CurrentEntries; unsigned CurrentSubEntries; - CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } unsigned getLoopDepth(); @@ -119,7 +119,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 0; case CFStack::FIRST_NON_WQM_PUSH: assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + if (ST->getGeneration() <= R600Subtarget::R700) { // +1 For the push operation. // +2 Extra space required. return 3; @@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + assert(ST->getGeneration() >= R600Subtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. 
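The updateMaxStackSize hunk just below also picks up an API rename from this release: RoundUpToAlignment became alignTo. A standalone sketch of the arithmetic, assuming the same contract as llvm::alignTo (round the value up to a multiple of the alignment):

#include <cassert>
#include <cstdint>

// Same contract as llvm::alignTo: round Value up to a multiple of Align.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Four sub-entries pack into one full CF stack entry, so the current stack
// size is the entry count plus the rounded-up sub-entry count over four.
static unsigned stackSize(unsigned Entries, unsigned SubEntries) {
  return Entries + unsigned(alignTo(SubEntries, 4) / 4);
}

int main() {
  assert(stackSize(2, 0) == 2);
  assert(stackSize(2, 1) == 3);  // one sub-entry still costs a full entry
  assert(stackSize(2, 5) == 4);  // five sub-entries round up to two entries
  return 0;
}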
return 2; @@ -142,8 +142,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { } void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = CurrentEntries + - (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + unsigned CurrentStackSize = + CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4); MaxStackSize = std::max(CurrentStackSize, MaxStackSize); } @@ -159,7 +159,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + ST->getGeneration() > R600Subtarget::EVERGREEN && !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; @@ -220,10 +220,10 @@ private: const R600InstrInfo *TII; const R600RegisterInfo *TRI; unsigned MaxFetchInst; - const AMDGPUSubtarget *ST; + const R600Subtarget *ST; - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { + bool IsTrivialInst(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::KILL: case AMDGPU::RETURN: return true; @@ -234,7 +234,7 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN); switch (CFI) { case CF_TC: Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; @@ -278,11 +278,12 @@ private: return TII->get(Opcode); } - bool isCompatibleWithClause(const MachineInstr *MI, - std::set &DstRegs) const { + bool isCompatibleWithClause(const MachineInstr &MI, + std::set &DstRegs) const { unsigned DstMI, SrcMI; - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { + for (MachineInstr::const_mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); + I != E; ++I) { const MachineOperand &MO = *I; if (!MO.isReg()) continue; @@ -318,20 +319,20 @@ private: MachineBasicBlock::iterator ClauseHead = I; std::vector ClauseContent; unsigned AluInstCount = 0; - bool IsTex = TII->usesTextureCache(ClauseHead); + bool IsTex = TII->usesTextureCache(*ClauseHead); std::set DstRegs; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) + if (IsTrivialInst(*I)) continue; if (AluInstCount >= MaxFetchInst) break; - if ((IsTex && !TII->usesTextureCache(I)) || - (!IsTex && !TII->usesVertexCache(I))) + if ((IsTex && !TII->usesTextureCache(*I)) || + (!IsTex && !TII->usesVertexCache(*I))) break; - if (!isCompatibleWithClause(I, DstRegs)) + if (!isCompatibleWithClause(*I, DstRegs)) break; AluInstCount ++; - ClauseContent.push_back(I); + ClauseContent.push_back(&*I); } MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), getHWInstrDesc(IsTex?CF_TC:CF_VC)) @@ -340,28 +341,37 @@ private: return ClauseFile(MIb, std::move(ClauseContent)); } - void getLiteral(MachineInstr *MI, std::vector &Lits) const { + void getLiteral(MachineInstr &MI, std::vector &Lits) const { static const unsigned LiteralRegs[] = { AMDGPU::ALU_LITERAL_X, AMDGPU::ALU_LITERAL_Y, AMDGPU::ALU_LITERAL_Z, AMDGPU::ALU_LITERAL_W }; - const SmallVector, 3 > Srcs = + const SmallVector, 3> Srcs = TII->getSrcs(MI); - for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { - if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + for (const auto &Src:Srcs) { + if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) continue; - int64_t Imm = Srcs[i].second; - std::vector::iterator It 
= - std::find(Lits.begin(), Lits.end(), Imm); + int64_t Imm = Src.second; + std::vector::iterator It = + std::find_if(Lits.begin(), Lits.end(), + [&](MachineOperand* val) + { return val->isImm() && (val->getImm() == Imm);}); + + // Get corresponding Operand + MachineOperand &Operand = MI.getOperand( + TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + if (It != Lits.end()) { + // Reuse existing literal reg unsigned Index = It - Lits.begin(); - Srcs[i].first->setReg(LiteralRegs[Index]); + Src.first->setReg(LiteralRegs[Index]); } else { + // Allocate new literal reg assert(Lits.size() < 4 && "Too many literals in Instruction Group"); - Srcs[i].first->setReg(LiteralRegs[Lits.size()]); - Lits.push_back(Imm); + Src.first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(&Operand); } } } @@ -384,56 +394,66 @@ private: ClauseFile MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) const { - MachineBasicBlock::iterator ClauseHead = I; + MachineInstr &ClauseHead = *I; std::vector ClauseContent; I++; for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { - if (IsTrivialInst(I)) { + if (IsTrivialInst(*I)) { ++I; continue; } if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) break; - std::vector Literals; + std::vectorLiterals; if (I->isBundle()) { - MachineInstr *DeleteMI = I; + MachineInstr &DeleteMI = *I; MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); while (++BI != E && BI->isBundledWithPred()) { BI->unbundleFromPred(); - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BI->getOperand(i); + for (MachineOperand &MO : BI->operands()) { if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(&*BI, Literals); + getLiteral(*BI, Literals); ClauseContent.push_back(&*BI); } I = BI; - DeleteMI->eraseFromParent(); + DeleteMI.eraseFromParent(); } else { - getLiteral(I, Literals); - ClauseContent.push_back(I); + getLiteral(*I, Literals); + ClauseContent.push_back(&*I); I++; } - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned literal0 = Literals[i]; - unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; - MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(literal0) - .addImm(literal2); + for (unsigned i = 0, e = Literals.size(); i < e; i += 2) { + MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)); + if (Literals[i]->isImm()) { + MILit.addImm(Literals[i]->getImm()); + } else { + MILit.addGlobalAddress(Literals[i]->getGlobal(), + Literals[i]->getOffset()); + } + if (i + 1 < e) { + if (Literals[i + 1]->isImm()) { + MILit.addImm(Literals[i + 1]->getImm()); + } else { + MILit.addGlobalAddress(Literals[i + 1]->getGlobal(), + Literals[i + 1]->getOffset()); + } + } else + MILit.addImm(0); ClauseContent.push_back(MILit); } } assert(ClauseContent.size() < 128 && "ALU clause is too big"); - ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, std::move(ClauseContent)); + ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(&ClauseHead, std::move(ClauseContent)); } void EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { - CounterPropagateAddr(Clause.first, CfCount); + CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) .addImm(CfCount); @@ -447,7 +467,7 @@ private: 
EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { Clause.first->getOperand(0).setImm(0); - CounterPropagateAddr(Clause.first, CfCount); + CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) .addImm(CfCount); @@ -457,13 +477,13 @@ private: CfCount += Clause.second.size(); } - void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { - MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const { + MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm()); } void CounterPropagateAddr(const std::set &MIs, unsigned Addr) const { for (MachineInstr *MI : MIs) { - CounterPropagateAddr(MI, Addr); + CounterPropagateAddr(*MI, Addr); } } @@ -472,20 +492,21 @@ public: : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override { - ST = &MF.getSubtarget(); + ST = &MF.getSubtarget(); MaxFetchInst = ST->getTexVTXClauseSize(); - TII = static_cast(ST->getInstrInfo()); - TRI = static_cast(ST->getRegisterInfo()); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); + R600MachineFunctionInfo *MFI = MF.getInfo(); - CFStack CFStack(ST, MFI->getShaderType()); + CFStack CFStack(ST, MF.getFunction()->getCallingConv()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector > > LoopStack; std::vector IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; @@ -493,10 +514,10 @@ public: std::vector FetchClauses, AluClauses; std::vector LastAlu(1); std::vector ToPopAfter; - + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) { DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; @@ -508,7 +529,7 @@ public: if (MI->getOpcode() != AMDGPU::ENDIF) LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) - LastAlu.back() = MI; + LastAlu.back() = &*MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); @@ -571,7 +592,7 @@ public: case AMDGPU::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); - CounterPropagateAddr(JumpInst, CfCount); + CounterPropagateAddr(*JumpInst, CfCount); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_ELSE)) .addImm(0) @@ -595,10 +616,10 @@ public: DEBUG(dbgs() << CfCount << ":"; MIb->dump();); CfCount++; } - + MachineInstr *IfOrElseInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount); + CounterPropagateAddr(*IfOrElseInst, CfCount); IfOrElseInst->getOperand(1).setImm(1); LastAlu.pop_back(); MI->eraseFromParent(); @@ -625,15 +646,16 @@ public: case AMDGPU::RETURN: { BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); CfCount++; - MI->eraseFromParent(); if (CfCount % 2) { BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); CfCount++; } + MI->eraseFromParent(); for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) EmitFetchClause(I, FetchClauses[i], CfCount); for 
(unsigned i = 0, e = AluClauses.size(); i < e; i++) EmitALUClause(I, AluClauses[i], CfCount); + break; } default: if (TII->isExport(MI->getOpcode())) { diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 51d87eda31d1..534461adc59f 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H -#define LLVM_LIB_TARGET_R600_R600DEFINES_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H +#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H #include "llvm/MC/MCRegisterInfo.h" diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index fdc20302f4a3..93ed5be94a54 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -38,8 +38,8 @@ private: const R600InstrInfo *TII; int Address; - unsigned OccupiedDwords(MachineInstr *MI) const { - switch (MI->getOpcode()) { + unsigned OccupiedDwords(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: @@ -53,17 +53,17 @@ private: // These will be expanded to two ALU instructions in the // ExpandSpecialInstructions pass. - if (TII->isLDSRetInstr(MI->getOpcode())) + if (TII->isLDSRetInstr(MI.getOpcode())) return 2; - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode())) + if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) || + TII->isReductionOp(MI.getOpcode())) return 4; unsigned NumLiteral = 0; - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { + for (MachineInstr::mop_iterator It = MI.operands_begin(), + E = MI.operands_end(); + It != E; ++It) { MachineOperand &MO = *It; if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) ++NumLiteral; @@ -71,12 +71,12 @@ private: return 1 + NumLiteral; } - bool isALU(const MachineInstr *MI) const { - if (TII->isALUInstr(MI->getOpcode())) + bool isALU(const MachineInstr &MI) const { + if (TII->isALUInstr(MI.getOpcode())) return true; - if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode())) return true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::PRED_X: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: @@ -89,8 +89,8 @@ private: } } - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { + bool IsTrivialInst(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::KILL: case AMDGPU::RETURN: case AMDGPU::IMPLICIT_DEF: @@ -114,18 +114,20 @@ private: ((((Sel >> 2) - 512) & 4095) >> 5) << 1); } - bool SubstituteKCacheBank(MachineInstr *MI, - std::vector > &CachedConsts, - bool UpdateInstr = true) const { + bool + SubstituteKCacheBank(MachineInstr &MI, + std::vector> &CachedConsts, + bool UpdateInstr = true) const { std::vector > UsedKCache; - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) return true; - const SmallVectorImpl > &Consts = + const SmallVectorImpl> &Consts = TII->getSrcs(MI); - assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + assert( + (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) && + "Can't assign Const"); for (unsigned i = 0, n 
= Consts.size(); i < n; ++i) { if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) continue; @@ -194,9 +196,9 @@ private: // in the clause. unsigned LastUseCount = 0; for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { - AluInstCount += OccupiedDwords(UseI); + AluInstCount += OccupiedDwords(*UseI); // Make sure we won't need to end the clause due to KCache limitations. - if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + if (!SubstituteKCacheBank(*UseI, KCacheBanks, false)) return false; // We have reached the maximum instruction limit before finding the @@ -230,9 +232,9 @@ private: bool PushBeforeModifier = false; unsigned AluInstCount = 0; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) + if (IsTrivialInst(*I)) continue; - if (!isALU(I)) + if (!isALU(*I)) break; if (AluInstCount > TII->getMaxAlusPerClause()) break; @@ -245,7 +247,7 @@ private: // clause as predicated alus). if (AluInstCount > 0) break; - if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH) PushBeforeModifier = true; AluInstCount ++; continue; @@ -267,16 +269,16 @@ private: if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) break; - if (!SubstituteKCacheBank(I, KCacheBanks)) + if (!SubstituteKCacheBank(*I, KCacheBanks)) break; - AluInstCount += OccupiedDwords(I); + AluInstCount += OccupiedDwords(*I); } unsigned Opcode = PushBeforeModifier ? AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) // We don't use the ADDR field until R600ControlFlowFinalizer pass, where // it is safe to assume it is 0. However if we always put 0 here, the ifcvt - // pass may assume that identical ALU clause starter at the beginning of a + // pass may assume that identical ALU clause starter at the beginning of a // true and false branch can be factorized which is not the case. 
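The clause-building scan above absorbs instructions while they remain ALU work and the clause stays within its instruction budget. A toy version of that forward scan, with a simplified `Op` record instead of MachineInstr (the field names are illustrative, not LLVM's):

#include <cassert>
#include <cstddef>
#include <vector>

struct Op {
  bool Trivial;     // KILL/RETURN-style markers that cost nothing
  bool IsALU;
  unsigned Dwords;  // what OccupiedDwords() would report
};

// Scan forward from I, skipping trivial instructions, and stop at the first
// non-ALU instruction or when the budget would be exceeded. Returns the
// index of the first instruction left out of the clause.
static size_t scanClause(const std::vector<Op> &Ops, size_t I, unsigned Max) {
  unsigned Count = 0;
  for (; I < Ops.size(); ++I) {
    if (Ops[I].Trivial)
      continue;
    if (!Ops[I].IsALU || Count + Ops[I].Dwords > Max)
      break;
    Count += Ops[I].Dwords;
  }
  return I;
}

int main() {
  std::vector<Op> Ops = {{false, true, 4}, {true, false, 0},
                         {false, true, 4}, {false, false, 1}};
  assert(scanClause(Ops, 0, 8) == 3);  // stops at the non-ALU instruction
  assert(scanClause(Ops, 0, 4) == 2);  // stops when the budget runs out
  return 0;
}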
.addImm(Address++) // ADDR .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 @@ -298,7 +300,8 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { @@ -307,7 +310,7 @@ public: if (I->getOpcode() == AMDGPU::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(I)) + if (isALU(*I)) I = MakeALUClause(MBB, I); else ++I; diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 211d392e8fcc..0385b6283f37 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -29,7 +29,6 @@ using namespace llvm; namespace { class R600ExpandSpecialInstrsPass : public MachineFunctionPass { - private: static char ID; const R600InstrInfo *TII; @@ -61,12 +60,13 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, int OpIdx = TII->getOperandIdx(*OldMI, Op); if (OpIdx > -1) { uint64_t Val = OldMI->getOperand(OpIdx).getImm(); - TII->setImmOperand(NewMI, Op, Val); + TII->setImmOperand(*NewMI, Op, Val); } } bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); const R600RegisterInfo &TRI = TII->getRegisterInfo(); @@ -107,11 +107,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 AMDGPU::ZERO); // src1 - TII->addFlag(PredSet, 0, MO_FLAG_MASK); + TII->addFlag(*PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1); } else { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1); } MI.eraseFromParent(); continue; @@ -137,9 +137,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan >= 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -166,9 +166,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan < 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -189,7 +189,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -212,10 +212,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Mask) { - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); } if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); unsigned Opcode = BMI->getOpcode(); // While not strictly necessary from hw point of view, we force // all src operands 
of a dot4 inst to belong to the same slot. @@ -330,10 +330,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { if (Chan != 0) NewMI->bundleWithPred(); if (Mask) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); + TII->addFlag(*NewMI, 0, MO_FLAG_MASK); } if (NotLast) { - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST); } SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp new file mode 100644 index 000000000000..dd5681ff5e8b --- /dev/null +++ b/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -0,0 +1,15 @@ +//===----------------------- R600FrameLowering.cpp ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +#include "R600FrameLowering.h" + +using namespace llvm; + +R600FrameLowering::~R600FrameLowering() { +} diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h new file mode 100644 index 000000000000..5fe4e0d201ac --- /dev/null +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -0,0 +1,30 @@ +//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class R600FrameLowering : public AMDGPUFrameLowering { +public: + R600FrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + virtual ~R600FrameLowering(); + + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {} + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {} +}; + +} + +#endif diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 124a9c6e0f56..8f78edd76a51 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -30,18 +30,61 @@ using namespace llvm; -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) +R600TargetLowering::R600TargetLowering(const TargetMachine &TM, + const R600Subtarget &STI) : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); computeRegisterProperties(STI.getRegisterInfo()); + // Legalize loads and stores to the private address space. 
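The block of setOperationAction(..., Custom) calls that follows routes those (opcode, type) pairs back into R600TargetLowering::LowerOperation during DAG legalization. A minimal sketch (not LLVM code) of what the Custom action means, using a toy action table and hook:

#include <cassert>
#include <functional>
#include <map>
#include <utility>

enum class Action { Legal, Custom };
struct Node { int Opcode; int VT; };

struct Legalizer {
  std::map<std::pair<int, int>, Action> Table;       // setOperationAction
  std::function<Node(const Node &)> LowerOperation;  // the target hook

  Node legalize(const Node &N) {
    auto It = Table.find({N.Opcode, N.VT});
    if (It != Table.end() && It->second == Action::Custom)
      return LowerOperation(N);  // the target gets the first say
    return N;                    // Legal: leave the node alone
  }
};

int main() {
  Legalizer L;
  L.Table[{/*LOAD*/ 1, /*i32*/ 7}] = Action::Custom;
  L.LowerOperation = [](const Node &N) { return Node{/*lowered*/ 2, N.VT}; };
  assert(L.legalize({1, 7}).Opcode == 2);  // custom-lowered
  assert(L.legalize({1, 8}).Opcode == 1);  // untouched
  return 0;
}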
+ setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. + setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); + // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); @@ -73,10 +116,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -122,37 +161,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. 
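The SEXTLOAD/ZEXTLOAD/EXTLOAD triples being moved here cover the three extending-load flavors. Their integer semantics, checked on plain C++ integers (EXTLOAD leaves the high bits unspecified, so any lowering that preserves the low bits is valid):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Mem = 0x90;  // an i8 loaded from memory, high bit set

  int32_t Sext = int32_t(int8_t(Mem));  // SEXTLOAD: replicate the sign bit
  uint32_t Zext = uint32_t(Mem);        // ZEXTLOAD: fill high bits with zero
  uint32_t Aext = Zext;                 // EXTLOAD: high bits are don't-care

  assert(Sext == -112);
  assert(Zext == 0x90u);
  assert((Aext & 0xFFu) == 0x90u);  // only the low byte is meaningful
  return 0;
}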
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); @@ -165,12 +173,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -188,119 +190,138 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, } setSchedulingPreference(Sched::Source); + + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); +} + +const R600Subtarget *R600TargetLowering::getSubtarget() const { + return static_cast(Subtarget); } static inline bool isEOP(MachineBasicBlock::iterator I) { return std::next(I)->getOpcode() == AMDGPU::RETURN; } -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { +MachineBasicBlock * +R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { MachineFunction * MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); + MachineBasicBlock::iterator I = MI; + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. 
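The LDS_*_RET replacement just below rebuilds the instruction with its no-return opcode when the destination register is dead. A toy model of that rewrite, with stand-ins (`resultIsDead`, `noRetOpcode`) for MRI.use_empty and AMDGPU::getLDSNoRetOp:

#include <cassert>
#include <cstddef>
#include <vector>

struct MInst {
  int Opcode;
  std::vector<int> Ops;  // Ops[0] is the destination register
};

static bool resultIsDead(int Reg) { return Reg < 0; }      // stand-in
static int noRetOpcode(int Opcode) { return Opcode + 1; }  // stand-in

// Mirror of the pattern in EmitInstrWithCustomInserter: keep the RET form
// while the result is live, otherwise re-emit as NORET and drop operand 0.
static MInst rewriteIfDead(const MInst &MI) {
  if (!resultIsDead(MI.Ops[0]))
    return MI;
  MInst New{noRetOpcode(MI.Opcode), {}};
  for (size_t i = 1; i < MI.Ops.size(); ++i)  // copy all but the dead dst
    New.Ops.push_back(MI.Ops[i]);
  return New;
}

int main() {
  MInst Live{10, {5, 7}};
  MInst Dead{10, {-1, 7}};
  assert(rewriteIfDead(Live).Opcode == 10);
  assert(rewriteIfDead(Dead).Opcode == 11);
  assert(rewriteIfDead(Dead).Ops.size() == 1);
  return 0;
}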
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || + MI.getOpcode() == AMDGPU::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); + TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); break; } case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_NEG); break; } case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); + unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); + TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; } case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) + .getFPImm() + ->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); break; case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), + MI.getOperand(1).getImm()); break; + case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + //TODO: Perhaps combine this instruction with the next if possible + auto MIB = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + //TODO: Ugh this is rather ugly + MIB->getOperand(Idx) = MI.getOperand(1); + break; + } case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + MI.getOperand(1).getImm()); break; } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 
case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::RAT_STORE_TYPED_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::TXD: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -333,75 +354,77 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( CTZ = 0; break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) + .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + 
.addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::TXD_SHADOW: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -435,99 +458,101 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) + .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI.getOperand(0)); + break; case 
AMDGPU::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } case AMDGPU::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) .addImm(OPCODE_IS_NOT_ZERO_INT) .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -535,7 +560,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); + unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock = BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { @@ -552,17 +577,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); + unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 
84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addOperand(MI.getOperand(3)) + .addOperand(MI.getOperand(4)) + .addOperand(MI.getOperand(5)) + .addOperand(MI.getOperand(6)) + .addImm(CfInst) + .addImm(EOP); break; } case AMDGPU::RETURN: { @@ -576,7 +601,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } @@ -610,18 +635,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { + case AMDGPUIntrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -649,114 +669,48 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const SDLoc DL(Op); switch(IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - int ijb = cast(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue 
RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { + case AMDGPUIntrinsic::r600_tex: + case AMDGPUIntrinsic::r600_texc: + case AMDGPUIntrinsic::r600_txl: + case AMDGPUIntrinsic::r600_txlc: + case AMDGPUIntrinsic::r600_txb: + case AMDGPUIntrinsic::r600_txbc: + case AMDGPUIntrinsic::r600_txf: + case AMDGPUIntrinsic::r600_txq: + case AMDGPUIntrinsic::r600_ddx: + case AMDGPUIntrinsic::r600_ddy: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::r600_texc: TextureOp = 1; break; - case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::r600_txl: TextureOp = 2; break; - case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::r600_txlc: TextureOp = 3; break; - case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::r600_txb: TextureOp = 4; break; - case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::r600_txbc: TextureOp = 5; break; - case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::r600_txf: TextureOp = 6; break; - case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::r600_txq: TextureOp = 7; break; - case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::r600_ddx: TextureOp = 8; break; - case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::r600_ddy: TextureOp = 9; break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; default: llvm_unreachable("Unknow Texture Operation"); } @@ -784,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::AMDGPU_dp4: { + case AMDGPUIntrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32)), @@ -806,6 +760,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } + case Intrinsic::r600_implicitarg_ptr: { + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); + uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return DAG.getConstant(ByteOffset, DL, PtrVT); + } case Intrinsic::r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); case Intrinsic::r600_read_ngroups_y: @@ -825,7 +784,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); - case Intrinsic::AMDGPU_read_workdim: { + case Intrinsic::r600_read_workdim: + case 
AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); } @@ -848,14 +808,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case Intrinsic::r600_recipsqrt_ieee: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case Intrinsic::r600_recipsqrt_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; } @@ -950,6 +910,22 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return vectorToVerticalVector(DAG, Insert); } +SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + + GlobalAddressSDNode *GSD = cast(Op); + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + const DataLayout &DL = DAG.getDataLayout(); + const GlobalValue *GV = GSD->getGlobal(); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + + SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); +} + SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) @@ -977,7 +953,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) + if (Gen >= R600Subtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
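LowerTrig's reduction above maps the operand into one revolution before the hardware op, TRIG(FRACT(x / 2π + 0.5) − 0.5), and on R600 multiplies back by 2π because its SIN/COS expect radians in [−π, π] rather than revolutions. A numeric check of that identity for a positive input:

#include <cassert>
#include <cmath>

int main() {
  const double TwoPi = 6.283185307179586;
  double X = 123.456;  // arbitrary positive angle, many revolutions

  // FRACT(X / 2pi + 0.5) - 0.5: the angle as revolutions in [-0.5, 0.5).
  double Fract = std::fmod(X / TwoPi + 0.5, 1.0);
  double Reduced = Fract - 0.5;

  // >= R700 consumes Reduced directly; R600 needs radians in [-pi, pi),
  // hence the ISD::FMUL by 2pi emitted after the TRIG node.
  double R600Angle = Reduced * TwoPi;
  assert(std::fabs(std::sin(R600Angle) - std::sin(X)) < 1e-9);
  return 0;
}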
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, @@ -1088,7 +1064,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, + const SDLoc &DL, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), @@ -1099,8 +1075,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); + MachinePointerInfo(ConstantPointerNull::get(PtrType))); } bool R600TargetLowering::isZero(SDValue Op) const { @@ -1113,6 +1088,20 @@ bool R600TargetLowering::isZero(SDValue Op) const { } } +bool R600TargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->isExactlyValue(1.0); + } + return isAllOnesConstant(Op); +} + +bool R600TargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->getValueAPF().isZero(); + } + return isNullConstant(Op); +} + SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); @@ -1311,19 +1300,73 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth, } } +SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, + SelectionDAG &DAG) const { + SDLoc DL(Store); + + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + EVT MemVT = Store->getMemoryVT(); + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); + if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG)) + return Result; + StoreSDNode *StoreNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Ptr = Op.getOperand(2); + unsigned AS = StoreNode->getAddressSpace(); + SDValue Value = StoreNode->getValue(); + EVT ValueVT = Value.getValueType(); - SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Result.getNode()) { - 
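// lowerPrivateTruncStore above emulates an i8/i16 private store as a
// read-modify-write on the containing 32-bit register. A standalone sketch
// of the same mask/shift/or arithmetic, with a plain uint32_t array standing
// in for the REGISTER_LOAD/REGISTER_STORE nodes (illustrative names, not the
// LLVM API):
#include <cassert>
#include <cstdint>

void storeSubDword(uint32_t *Regs, uint32_t ByteAddr, uint32_t Val,
                   unsigned StoreBits /* 8 or 16 */) {
  assert(StoreBits == 8 || StoreBits == 16);
  uint32_t Mask = StoreBits == 8 ? 0xffu : 0xffffu;
  uint32_t DwordIdx = ByteAddr >> 2;         // SRL BasePtr, 2
  uint32_t ShiftAmt = (ByteAddr & 0x3) << 3; // bit offset of the target byte
  uint32_t Dst = Regs[DwordIdx];             // REGISTER_LOAD
  Dst &= ~(Mask << ShiftAmt);  // the XOR-with-0xffffffff DstMask above
  Dst |= (Val & Mask) << ShiftAmt; // masked value, shifted into place
  Regs[DwordIdx] = Dst;            // REGISTER_STORE
}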
return Result; + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + ValueVT.isVector()) { + return SplitVectorStore(Op, DAG); } - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + SDLoc DL(Op); + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); + + if (AS == AMDGPUAS::GLOBAL_ADDRESS) { if (StoreNode->isTruncatingStore()) { EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); @@ -1352,13 +1395,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, DL, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - Value.getValueType().bitsGE(MVT::i32)) { + ValueVT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), @@ -1373,21 +1416,16 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); - } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing + EVT MemVT = StoreNode->getMemoryVT(); + if (MemVT.bitsLT(MVT::i32)) + return lowerPrivateTruncStore(StoreNode, DAG); + // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1465,37 +1503,81 @@ ConstantAddressBlock(unsigned AddressSpace) { } } -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); +SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, + SelectionDAG &DAG) const { SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; + LoadSDNode *Load = cast(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); + + // getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), + Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. 
+ if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; - if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) - return Ret; + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; - // Lower loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { + return DAG.getMergeValues(Ops, DL); +} - SDValue Ptr = DAG.getZExtOrTrunc( - LoadNode->getBasePtr(), DL, - getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LoadNode = cast(Op); + unsigned AS = LoadNode->getAddressSpace(); + EVT MemVT = LoadNode->getMemoryVT(); + ISD::LoadExtType ExtType = LoadNode->getExtensionType(); + + if (AS == AMDGPUAS::PRIVATE_ADDRESS && + ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { + return lowerPrivateExtLoad(Op, DAG); } + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), + scalarizeVectorLoad(LoadNode, DAG), Chain }; return DAG.getMergeValues(MergedValues, DL); @@ -1526,8 +1608,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); + Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1550,6 +1631,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(MergedValues, DL); } + SDValue LoweredLoad; + // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1560,12 +1643,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { EVT MemVT = LoadNode->getMemoryVT(); assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); + SDValue NewLoad = DAG.getExtLoad( + ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, + LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); @@ -1579,8 +1659,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1590,6 +1669,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const EVT ElemVT = VT.getVectorElementType(); SDValue Loads[4]; + assert(NumElemVT <= 4); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1603,11 +1683,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const DAG.getTargetConstant(Channel, DL, MVT::i32), Op.getOperand(2)); } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); + LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1632,16 +1709,28 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Chain, Jump, Cond); } +SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + + FrameIndexSDNode *FIN = cast(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. 
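// lowerPrivateExtLoad above is the load-side mirror of the truncating store:
// fetch the containing dword, shift the addressed byte or halfword down,
// then sign- or zero-extend. A standalone sketch of that extraction, again
// with plain integers rather than SelectionDAG nodes (illustrative names
// only):
#include <cstdint>

int32_t loadSubDword(const uint32_t *Regs, uint32_t ByteAddr,
                     unsigned LoadBits /* 8 or 16 */, bool IsSext) {
  // REGISTER_LOAD of the dword, then SRL by the byte offset in bits.
  uint32_t Ret = Regs[ByteAddr >> 2] >> ((ByteAddr & 0x3) << 3);
  uint32_t Mask = LoadBits == 8 ? 0xffu : 0xffffu;
  if (!IsSext)
    return int32_t(Ret & Mask);            // "... or zeros" (ZEXTLOAD)
  uint32_t SignBit = 1u << (LoadBits - 1); // "... ones" (SEXTLOAD):
  return int32_t(((Ret & Mask) ^ SignBit) - SignBit); // sign_extend_inreg
}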
SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -1664,7 +1753,7 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } - if (MFI->getShaderType() != ShaderType::COMPUTE) { + if (AMDGPU::isShader(CallConv)) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); @@ -1699,11 +1788,11 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned Offset = 36 + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); - SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), - DAG.getUNDEF(MVT::i32), - PtrInfo, - MemVT, false, true, true, 4); + SDValue Arg = DAG.getLoad( + ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + MemVT, /* Alignment = */ 4, + MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); @@ -1719,6 +1808,26 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + if (!VT.isSimple() || VT == MVT::Other) + return false; + + if (VT.bitsLT(MVT::i32)) + return false; + + // TODO: This is a rough estimate. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + static SDValue CompactSwizzlableVector( SelectionDAG &DAG, SDValue VectorEntry, DenseMap &RemapSwizzle) { @@ -1732,7 +1841,7 @@ static SDValue CompactSwizzlableVector( }; for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, // break false dependencies and additionnaly make assembly easier to read. 
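// CompactSwizzlableVector, whose body follows in the next hunk, records per
// lane either "undef" (so later passes can simply mask the write) or a
// duplicate of an earlier lane (so one register read can be shared). A rough
// sketch of that dedup bookkeeping on plain ints, with -1 standing in for an
// undef element and std::map standing in for LLVM's DenseMap:
#include <map>

// Maps a lane to the earlier lane whose value it can reuse.
std::map<unsigned, unsigned> remapLanes(const int (&Elt)[4]) {
  std::map<unsigned, unsigned> Remap;
  for (unsigned I = 0; I < 4; ++I) {
    if (Elt[I] < 0)
      continue; // undef: nothing to read, the write gets masked instead
    for (unsigned J = 0; J < I; ++J) {
      if (Elt[I] == Elt[J]) { // same value as an earlier lane
        Remap[I] = J;         // reuse lane J rather than reading it twice
        break;
      }
    }
  }
  return Remap;
}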
@@ -1747,7 +1856,7 @@ static SDValue CompactSwizzlableVector( } } - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { @@ -1758,8 +1867,8 @@ static SDValue CompactSwizzlableVector( } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1796,14 +1905,13 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], + SelectionDAG &DAG, + const SDLoc &DL) const { assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> New swizzle values DenseMap SwizzleRemap; @@ -1886,7 +1994,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) + if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); @@ -1907,7 +2015,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, if (InVec.getOpcode() == ISD::BUILD_VECTOR) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { + } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { @@ -1927,7 +2035,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1953,8 +2061,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) + if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI)) return Ret; // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> @@ -2053,13 +2160,14 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -static bool -FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); +bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, + SDValue &Src, SDValue &Neg, SDValue &Abs, + SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Src.isMachineOpcode()) return false; + switch (Src.getMachineOpcode()) { case AMDGPU::FNEG_R600: if (!Neg.getNode()) @@ -2127,6 +2235,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); return true; } + case AMDGPU::MOV_IMM_GLOBAL_ADDR: + // Check if the Imm slot is used. Taken from below. 
+ if (cast(Imm)->getZExtValue()) + return false; + Imm = Src.getOperand(0); + Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + return true; case AMDGPU::MOV_IMM_I32: case AMDGPU::MOV_IMM_F32: { unsigned ImmReg = AMDGPU::ALU_LITERAL_X; @@ -2177,14 +2292,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, } } - /// \brief Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; + unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 4dbac97af2a1..2fb6ee25caa9 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -12,55 +12,69 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H -#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H #include "AMDGPUISelLowering.h" namespace llvm { class R600InstrInfo; +class R600Subtarget; -class R600TargetLowering : public AMDGPUTargetLowering { +class R600TargetLowering final : public AMDGPUTargetLowering { public: - R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const override; + R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); + + const R600Subtarget *getSubtarget() const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; - SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the /// first nine dwords of a Vertex Buffer. These implicit parameters are /// lowered to load instructions which retrieve the values from the Vertex /// Buffer. 
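// The nine dword slots that comment refers to, as lowered by the
// LowerOperation cases earlier in this patch: ngroups_x maps to dword 0 and
// local_size_z to dword 8 in the hunks shown above; the slots in between are
// inferred from the surrounding cases, so treat this table as an
// illustrative reading rather than a normative definition:
enum R600ImplicitParamDword {
  NGROUPS_X = 0,     NGROUPS_Y = 1,     NGROUPS_Z = 2,
  GLOBAL_SIZE_X = 3, GLOBAL_SIZE_Y = 4, GLOBAL_SIZE_Z = 5,
  LOCAL_SIZE_X = 6,  LOCAL_SIZE_Y = 7,  LOCAL_SIZE_Z = 8,
};

// LowerImplicitParameter turns a dword offset into a byte-addressed load
// (ByteOffset = DwordOffset * 4 in the hunk above):
constexpr unsigned implicitParamByteOffset(R600ImplicitParamDword P) {
  return unsigned(P) * 4;
}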
- SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, unsigned DwordOffset) const; + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL, + unsigned DwordOffset) const; void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - SDLoc DL) const; + const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; @@ -74,6 +88,13 @@ private: void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, + SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 8b6eea17130b..1c5f7ec1b6ef 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -28,26 +28,17 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { - return RI; -} - -bool R600InstrInfo::isTrig(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; -} +R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) + : AMDGPUInstrInfo(ST), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; } -void -R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && @@ -91,10 +82,9 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, } bool R600InstrInfo::isMov(unsigned Opcode) const { - - switch(Opcode) { - default: return false; + default: + return false; case AMDGPU::MOV: case AMDGPU::MOV_IMM_F32: case AMDGPU::MOV_IMM_I32: @@ -102,17 +92,6 @@ bool R600InstrInfo::isMov(unsigned Opcode) const { } } -// Some instructions act as place holders to emulate 
operations that the GPU -// hardware does automatically. This function can be used to check if -// an opcode falls into this category. -bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { - switch (Opcode) { - default: return false; - case AMDGPU::RETURN: - return true; - } -} - bool R600InstrInfo::isReductionOp(unsigned Opcode) const { return false; } @@ -150,20 +129,16 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::LDS_1A2D)); } -bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; -} - bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; } -bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { - if (isALUInstr(MI->getOpcode())) +bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { + if (isALUInstr(MI.getOpcode())) return true; - if (isVector(*MI) || isCubeOp(MI->getOpcode())) + if (isVector(MI) || isCubeOp(MI.getOpcode())) return true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::PRED_X: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: @@ -182,16 +157,16 @@ bool R600InstrInfo::isTransOnly(unsigned Opcode) const { return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); } -bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { - return isTransOnly(MI->getOpcode()); +bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { + return isTransOnly(MI.getOpcode()); } bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); } -bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { - return isVectorOnly(MI->getOpcode()); +bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const { + return isVectorOnly(MI.getOpcode()); } bool R600InstrInfo::isExport(unsigned Opcode) const { @@ -202,23 +177,21 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { return ST.hasVertexCache() && IS_VTX(get(Opcode)); } -bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return MFI->getShaderType() != ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode()); +bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getParent()->getParent(); + return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + usesVertexCache(MI.getOpcode()); } bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); } -bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return (MFI->getShaderType() == ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); +bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getParent()->getParent(); + return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + usesVertexCache(MI.getOpcode())) || + usesTextureCache(MI.getOpcode()); } bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { @@ -231,20 +204,21 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } } -bool 
R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { + return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; } -bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { + return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; } -bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { - if (!isALUInstr(MI->getOpcode())) { +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { + if (!isALUInstr(MI.getOpcode())) { return false; } - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { + for (MachineInstr::const_mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); + I != E; ++I) { if (!I->isReg() || !I->isUse() || TargetRegisterInfo::isVirtualRegister(I->getReg())) continue; @@ -255,17 +229,6 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { return false; } -int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { - static const unsigned OpTable[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - - assert (SrcNum < 3); - return getOperandIdx(Opcode, OpTable[SrcNum]); -} - int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { static const unsigned SrcSelTable[][2] = { {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, @@ -290,10 +253,10 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { } SmallVector, 3> -R600InstrInfo::getSrcs(MachineInstr *MI) const { +R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector, 3> Result; - if (MI->getOpcode() == AMDGPU::DOT_4) { + if (MI.getOpcode() == AMDGPU::DOT_4) { static const unsigned OpTable[8][2] = { {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, @@ -306,13 +269,13 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { }; for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); + MachineOperand &MO = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); unsigned Reg = MO.getReg(); if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); + MachineOperand &Sel = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -327,30 +290,33 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { }; for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); if (SrcIdx < 0) break; - MachineOperand &MO = MI->getOperand(SrcIdx); - unsigned Reg = MI->getOperand(SrcIdx).getReg(); + MachineOperand &MO = MI.getOperand(SrcIdx); + unsigned Reg = MO.getReg(); if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand( - getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); + MachineOperand &Sel = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned Imm = MI->getOperand( - getOperandIdx(MI->getOpcode(), 
AMDGPU::OpName::literal)).getImm(); - Result.push_back(std::pair(&MO, Imm)); - continue; + MachineOperand &Operand = + MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + if (Operand.isImm()) { + Result.push_back(std::make_pair(&MO, Operand.getImm())); + continue; + } + assert(Operand.isGlobal()); } - Result.push_back(std::pair(&MO, 0)); + Result.push_back(std::make_pair(&MO, 0)); } return Result; } -std::vector > -R600InstrInfo::ExtractSrcs(MachineInstr *MI, +std::vector> +R600InstrInfo::ExtractSrcs(MachineInstr &MI, const DenseMap &PV, unsigned &ConstCount) const { ConstCount = 0; @@ -360,13 +326,13 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI, unsigned i = 0; for (unsigned n = Srcs.size(); i < n; ++i) { unsigned Reg = Srcs[i].first->getReg(); - unsigned Index = RI.getEncodingValue(Reg) & 0xff; + int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == AMDGPU::OQAP) { - Result.push_back(std::pair(Index, 0)); + Result.push_back(std::make_pair(Index, 0U)); } if (PV.find(Reg) != PV.end()) { // 255 is used to tells its a PS/PV reg - Result.push_back(std::pair(255, 0)); + Result.push_back(std::make_pair(255, 0U)); continue; } if (Index > 127) { @@ -375,7 +341,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI, continue; } unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair(Index, Chan)); + Result.push_back(std::make_pair(Index, Chan)); } for (; i < 3; ++i) Result.push_back(DummyPair); @@ -411,8 +377,7 @@ Swizzle(std::vector > Src, return Src; } -static unsigned -getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { +static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { switch (Swz) { case R600InstrInfo::ALU_VEC_012_SCL_210: { unsigned Cycles[3] = { 2, 1, 0}; @@ -432,7 +397,6 @@ getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { } default: llvm_unreachable("Wrong Swizzle for Trans Slot"); - return 0; } } @@ -557,7 +521,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector &IG, unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); unsigned Op = getOperandIdx(IG[i]->getOpcode(), AMDGPU::OpName::bank_swizzle); ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) @@ -624,14 +588,13 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) std::vector Consts; SmallSet Literals; for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr *MI = MIs[i]; - if (!isALUInstr(MI->getOpcode())) + MachineInstr &MI = *MIs[i]; + if (!isALUInstr(MI.getOpcode())) continue; ArrayRef> Srcs = getSrcs(MI); - for (unsigned j = 0, e = Srcs.size(); j < e; j++) { - std::pair Src = Srcs[j]; + for (const auto &Src:Srcs) { if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -652,7 +615,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) DFAPacketizer * R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { const InstrItineraryData *II = STI.getInstrItineraryData(); - return static_cast(STI).createDFAPacketizer(II); + return static_cast(STI).createDFAPacketizer(II); } static bool @@ -670,9 +633,9 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { while (I != MBB.begin()) { --I; - MachineInstr *MI = I; - if (isPredicateSetter(MI->getOpcode())) - return MI; + MachineInstr &MI = *I; + if 
(isPredicateSetter(MI.getOpcode())) + return &MI; } return nullptr; @@ -688,12 +651,11 @@ static bool isBranch(unsigned Opcode) { Opcode == AMDGPU::BRANCH_COND_f32; } -bool -R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { +bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { // Most of the following comes from the ARM implementation of AnalyzeBranch // If the block has no terminators, it just falls into the block after it. @@ -716,21 +678,21 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, I->removeFromParent(); I = PriorI; } - MachineInstr *LastInst = I; + MachineInstr &LastInst = *I; // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); + unsigned LastOpc = LastInst.getOpcode(); if (I == MBB.begin() || !isJump(static_cast(--I)->getOpcode())) { if (LastOpc == AMDGPU::JUMP) { - TBB = LastInst->getOperand(0).getMBB(); + TBB = LastInst.getOperand(0).getMBB(); return false; } else if (LastOpc == AMDGPU::JUMP_COND) { - MachineInstr *predSet = I; + auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; } - TBB = LastInst->getOperand(0).getMBB(); + TBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); @@ -740,17 +702,17 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); + MachineInstr &SecondLastInst = *I; + unsigned SecondLastOpc = SecondLastInst.getOpcode(); // If the block ends with a B and a Bcc, handle it. 
if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { - MachineInstr *predSet = --I; + auto predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; } - TBB = SecondLastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); + TBB = SecondLastInst.getOperand(0).getMBB(); + FBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); @@ -772,12 +734,11 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { return MBB.end(); } -unsigned -R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - ArrayRef Cond, - DebugLoc DL) const { +unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef Cond, + const DebugLoc &DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); if (!FBB) { @@ -787,7 +748,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); + addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) @@ -803,7 +764,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); + addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) .addMBB(TBB) @@ -835,7 +796,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 0; case AMDGPU::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); + clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) @@ -860,7 +821,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 1; case AMDGPU::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); + clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) @@ -876,13 +837,12 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -bool -R600InstrInfo::isPredicated(const MachineInstr *MI) const { - int idx = MI->findFirstPredOperandIdx(); +bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { + int idx = MI.findFirstPredOperandIdx(); if (idx < 0) return false; - unsigned Reg = MI->getOperand(idx).getReg(); + unsigned Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; case AMDGPU::PRED_SEL_ONE: @@ -892,25 +852,22 @@ R600InstrInfo::isPredicated(const MachineInstr *MI) const { } } -bool -R600InstrInfo::isPredicable(MachineInstr *MI) const { +bool R600InstrInfo::isPredicable(MachineInstr &MI) const { // XXX: KILL* instructions can be predicated, but they must be the last // instruction in a clause, so this means any instructions after them cannot // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. 
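// The analyzeBranch rewrite above keeps the usual two-terminator shape: a
// lone JUMP is an unconditional branch, a lone JUMP_COND is a conditional
// branch that falls through, and JUMP_COND followed by JUMP yields both
// destinations. A compact sketch of that decision table with plain enums,
// the MachineInstr plumbing elided:
enum TermKind { JUMP, JUMP_COND, OTHER };
struct BranchShape { bool Analyzable, Conditional, FallsThrough; };

BranchShape classifyTerminators(TermKind SecondLast, TermKind Last) {
  if (SecondLast == JUMP_COND && Last == JUMP)
    return {true, true, false};  // Bcc then B: TBB and FBB both known
  if (SecondLast == OTHER && Last == JUMP)
    return {true, false, false}; // plain B: unconditional to TBB
  if (SecondLast == OTHER && Last == JUMP_COND)
    return {true, true, true};   // Bcc alone: falls through to next block
  return {false, false, false};  // anything else is not analyzable
}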
- if (MI->getOpcode() == AMDGPU::KILLGT) { + if (MI.getOpcode() == AMDGPU::KILLGT) { return false; - } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + } else if (MI.getOpcode() == AMDGPU::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. - if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI)) return false; // TODO: We don't support KC merging atm - if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) - return false; - return true; - } else if (isVector(*MI)) { + return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0; + } else if (isVector(MI)) { return false; } else { return AMDGPUInstrInfo::isPredicable(MI); @@ -986,48 +943,39 @@ R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) con return false; } -bool -R600InstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - return isPredicateSetter(MI->getOpcode()); +bool R600InstrInfo::DefinesPredicate(MachineInstr &MI, + std::vector &Pred) const { + return isPredicateSetter(MI.getOpcode()); } -bool -R600InstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - return false; -} - - -bool -R600InstrInfo::PredicateInstruction(MachineInstr *MI, - ArrayRef Pred) const { - int PIdx = MI->findFirstPredOperandIdx(); +bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, + ArrayRef Pred) const { + int PIdx = MI.findFirstPredOperandIdx(); - if (MI->getOpcode() == AMDGPU::CF_ALU) { - MI->getOperand(8).setImm(0); + if (MI.getOpcode() == AMDGPU::CF_ALU) { + MI.getOperand(8).setImm(0); return true; } - if (MI->getOpcode() == AMDGPU::DOT_4) { - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + if (MI.getOpcode() == AMDGPU::DOT_4) { + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); return true; } if (PIdx != -1) { - MachineOperand &PMO = MI->getOperand(PIdx); + MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); return true; } @@ -1035,45 +983,94 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI, return false; } -unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const { return 2; } unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, + const MachineInstr &, unsigned *PredCost) const { if (PredCost) *PredCost = 2; return 2; } -bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned 
Channel) const { + assert(Channel == 0); + return RegIndex; +} - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); +bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: { + MachineBasicBlock *MBB = MI.getParent(); + int OffsetOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. + int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan); + if (isRegisterLoad(MI)) { + int DstOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); + unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI.getOperand(DstOpIdx).getReg(), Address, + OffsetReg); + } + } else if (isRegisterStore(MI)) { + int ValOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val); + unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); + unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI.getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI.getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; + } case AMDGPU::R600_EXTRACT_ELT_V2: case AMDGPU::R600_EXTRACT_ELT_V4: - buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(2).getReg(), - RI.getHWRegChan(MI->getOperand(1).getReg())); + buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(), + RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address + MI.getOperand(2).getReg(), + RI.getHWRegChan(MI.getOperand(1).getReg())); break; case AMDGPU::R600_INSERT_ELT_V2: case AMDGPU::R600_INSERT_ELT_V4: - buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(3).getReg(), // Offset - RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address + MI.getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI.getOperand(1).getReg())); // Channel break; } - MI->eraseFromParent(); + MI.eraseFromParent(); return true; } void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = static_cast( - MF.getSubtarget().getFrameLowering()); + const R600Subtarget &ST = MF.getSubtarget(); + const R600FrameLowering *TFL = ST.getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); @@ -1091,13 +1088,6 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, } } -unsigned 
R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - // XXX: Remove when we support a stack width > 2 - assert(Channel == 0); - return RegIndex; -} - const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::R600_TReg32_XRegClass; } @@ -1124,13 +1114,13 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); + setImmOperand(*MOVA, AMDGPU::OpName::write, 0); MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, AddrReg, ValueReg) .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1); return Mov; } @@ -1157,17 +1147,74 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); + setImmOperand(*MOVA, AMDGPU::OpName::write, 0); MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, ValueReg, AddrReg) .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1); return Mov; } +int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + if (MFI->hasVarSizedObjects()) { + return -1; + } + + if (MFI->getNumObjects() == 0) { + return -1; + } + + const R600Subtarget &ST = MF.getSubtarget(); + const R600FrameLowering *TFL = ST.getFrameLowering(); + + unsigned IgnoredFrameReg; + Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg); + + return getIndirectIndexBegin(MF) + Offset; +} + unsigned R600InstrInfo::getMaxAlusPerClause() const { return 115; } @@ -1256,7 +1303,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( const { assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); unsigned Opcode; - if (ST.getGeneration() <= AMDGPUSubtarget::R700) + if (ST.getGeneration() <= R600Subtarget::R700) Opcode = AMDGPU::DOT4_r600; else Opcode = AMDGPU::DOT4_eg; @@ -1293,7 +1340,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineOperand &MO = MI->getOperand( getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); assert (MO.isImm()); - setImmOperand(MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operands[i], 
MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; @@ -1305,7 +1352,7 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, uint64_t Imm) const { MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, AMDGPU::ALU_LITERAL_X); - setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm); return MovImm; } @@ -1323,25 +1370,21 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { return AMDGPU::getNamedOperandIdx(Opcode, Op); } -void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, +void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const { - int Idx = getOperandIdx(*MI, Op); + int Idx = getOperandIdx(MI, Op); assert(Idx != -1 && "Operand not supported for this instruction."); - assert(MI->getOperand(Idx).isImm()); - MI->getOperand(Idx).setImm(Imm); + assert(MI.getOperand(Idx).isImm()); + MI.getOperand(Idx).setImm(Imm); } //===----------------------------------------------------------------------===// // Instruction flag getters/setters //===----------------------------------------------------------------------===// -bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { - return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; -} - -MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; int FlagIndex = 0; if (Flag != 0) { // If we pass something other than the default value of Flag to this @@ -1351,20 +1394,26 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; switch (Flag) { case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp); break; case MO_FLAG_MASK: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write); break; case MO_FLAG_NOT_LAST: case MO_FLAG_LAST: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last); break; case MO_FLAG_NEG: switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; - case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + case 0: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg); + break; + case 1: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg); + break; + case 2: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg); + break; } break; @@ -1373,8 +1422,12 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, "instructions."); (void)IsOP3; switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + case 0: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs); + break; + case 1: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs); + break; } break; @@ -1389,14 +1442,14 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, "Instruction flags not supported for this instruction"); } - MachineOperand &FlagOp = MI->getOperand(FlagIndex); + MachineOperand &FlagOp = 
MI.getOperand(FlagIndex); assert(FlagOp.isImm()); return FlagOp; } -void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, +void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (Flag == 0) { return; } @@ -1415,9 +1468,9 @@ void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, } } -void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, +void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (HAS_NATIVE_OPERANDS(TargetFlags)) { MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); FlagOp.setImm(0); @@ -1428,3 +1481,11 @@ void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, FlagOp.setImm(InstFlags); } } + +bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index e7251c31107b..feaca98def44 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -12,30 +12,28 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H -#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H #include "AMDGPUInstrInfo.h" -#include "R600Defines.h" #include "R600RegisterInfo.h" -#include namespace llvm { - - class AMDGPUTargetMachine; - class DFAPacketizer; - class ScheduleDAG; - class MachineFunction; - class MachineInstr; - class MachineInstrBuilder; - - class R600InstrInfo : public AMDGPUInstrInfo { - private: +class AMDGPUTargetMachine; +class DFAPacketizer; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; +class R600Subtarget; + +class R600InstrInfo final : public AMDGPUInstrInfo { +private: const R600RegisterInfo RI; + const R600Subtarget &ST; - std::vector > - ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; - + std::vector> + ExtractSrcs(MachineInstr &MI, const DenseMap &PV, + unsigned &ConstCount) const; MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, @@ -44,11 +42,11 @@ namespace llvm { unsigned AddrChan) const; MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - public: + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; +public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, ALU_VEC_021_SCL_122, @@ -58,18 +56,18 @@ namespace llvm { ALU_VEC_210 }; - explicit R600InstrInfo(const AMDGPUSubtarget &st); + explicit R600InstrInfo(const R600Subtarget &); - const R600RegisterInfo &getRegisterInfo() const override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, + const R600RegisterInfo &getRegisterInfo() const { + return RI; + } + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const 
DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const override; - bool isTrig(const MachineInstr &MI) const; - bool isPlaceHolderOpcode(unsigned opcode) const; bool isReductionOp(unsigned opcode) const; bool isCubeOp(unsigned opcode) const; @@ -77,32 +75,28 @@ namespace llvm { bool isALUInstr(unsigned Opcode) const; bool hasInstrModifiers(unsigned Opcode) const; bool isLDSInstr(unsigned Opcode) const; - bool isLDSNoRetInstr(unsigned Opcode) const; bool isLDSRetInstr(unsigned Opcode) const; /// \returns true if this \p Opcode represents an ALU instruction or an /// instruction that will be lowered in ExpandSpecialInstrs Pass. - bool canBeConsideredALU(const MachineInstr *MI) const; + bool canBeConsideredALU(const MachineInstr &MI) const; bool isTransOnly(unsigned Opcode) const; - bool isTransOnly(const MachineInstr *MI) const; + bool isTransOnly(const MachineInstr &MI) const; bool isVectorOnly(unsigned Opcode) const; - bool isVectorOnly(const MachineInstr *MI) const; + bool isVectorOnly(const MachineInstr &MI) const; bool isExport(unsigned Opcode) const; bool usesVertexCache(unsigned Opcode) const; - bool usesVertexCache(const MachineInstr *MI) const; + bool usesVertexCache(const MachineInstr &MI) const; bool usesTextureCache(unsigned Opcode) const; - bool usesTextureCache(const MachineInstr *MI) const; + bool usesTextureCache(const MachineInstr &MI) const; bool mustBeLastInClause(unsigned Opcode) const; - bool usesAddressRegister(MachineInstr *MI) const; - bool definesAddressRegister(MachineInstr *MI) const; - bool readsLDSSrcReg(const MachineInstr *MI) const; + bool usesAddressRegister(MachineInstr &MI) const; + bool definesAddressRegister(MachineInstr &MI) const; + bool readsLDSSrcReg(const MachineInstr &MI) const; - /// \returns The operand index for the given source number. Legal values - /// for SrcNum are 0, 1, and 2. - int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; /// \returns The operand Index for the Sel operand given an index to one /// of the instruction's src operands. int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; @@ -113,7 +107,7 @@ namespace llvm { /// If register is ALU_LITERAL, second member is IMM. /// Otherwise, second member value is undefined. SmallVector, 3> - getSrcs(MachineInstr *MI) const; + getSrcs(MachineInstr &MI) const; unsigned isLegalUpTo( const std::vector > > &IGSrcs, @@ -152,89 +146,107 @@ namespace llvm { /// instruction slots within an instruction group. 
bool isVector(const MachineInstr &MI) const; - bool isMov(unsigned Opcode) const override; + bool isMov(unsigned Opcode) const; DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const override; - bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; + bool ReverseBranchCondition( + SmallVectorImpl &Cond) const override; - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, bool AllowModify) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, - DebugLoc DL) const override; + const DebugLoc &DL) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - bool isPredicated(const MachineInstr *MI) const override; + bool isPredicated(const MachineInstr &MI) const override; - bool isPredicable(MachineInstr *MI) const override; + bool isPredicable(MachineInstr &MI) const override; - bool - isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - BranchProbability Probability) const override; + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, BranchProbability Probability) const override ; - bool - isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - BranchProbability Probability) const override; - - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; + bool DefinesPredicate(MachineInstr &MI, + std::vector &Pred) const override; bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const override; + MachineBasicBlock &FMBB) const override; - bool PredicateInstruction(MachineInstr *MI, + bool PredicateInstruction(MachineInstr &MI, ArrayRef Pred) const override; - unsigned int getPredicationCost(const MachineInstr *) const override; + unsigned int getPredicationCost(const MachineInstr &) const override; unsigned int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, + const MachineInstr &MI, unsigned *PredCost = nullptr) const override; - int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const override { return 1;} - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; /// \brief Reserve the registers that may be accessed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; + /// Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accessed with loads and stores.
The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const; + + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + const TargetRegisterClass *getIndirectAddrRegClass() const; + + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; - const TargetRegisterClass *getIndirectAddrRegClass() const override; + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexEnd(const MachineFunction &MF) const; + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; + unsigned OffsetReg) const; unsigned getMaxAlusPerClause() const; - ///buildDefaultInstruction - This function returns a MachineInstr with - /// all the instruction modifiers initialized to their default values. - /// You can use this function to avoid manually specifying each instruction - /// modifier operand when building a new instruction. + /// buildDefaultInstruction - This function returns a MachineInstr with all + /// the instruction modifiers initialized to their default values. You can + /// use this function to avoid manually specifying each instruction modifier + /// operand when building a new instruction. /// /// \returns a MachineInstr with all the instruction modifiers initialized /// to their default values. @@ -251,13 +263,13 @@ namespace llvm { unsigned DstReg) const; MachineInstr *buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const; + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; + unsigned DstReg, unsigned SrcReg) const; /// \brief Get the index of Op in the MachineInstr. /// @@ -270,13 +282,10 @@ namespace llvm { int getOperandIdx(unsigned Opcode, unsigned Op) const; /// \brief Helper function for setting instruction flag values. - void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; - - /// \returns true if this instruction has an operand for storing target flags. - bool hasFlagOperand(const MachineInstr &MI) const; + void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const; ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. 
- void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; ///\brief Determine if the specified \p Flag is set on this \p Operand. bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; @@ -285,11 +294,15 @@ namespace llvm { /// \param Flag The flag being set. /// /// \returns the operand containing the flags for this instruction. - MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0, unsigned Flag = 0) const; /// \brief Clear the specified flag on the instruction. - void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; }; namespace AMDGPU { diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 33ef6a4e19ea..b6b576d95278 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -15,7 +15,7 @@ include "R600Intrinsics.td" include "R600InstrFormats.td" -class InstR600ISA pattern> : +class InstR600ISA pattern = []> : InstR600 { let Namespace = "AMDGPU"; @@ -160,7 +160,8 @@ class R600_2OP inst, string opName, list pattern, let Inst{63-32} = Word1; } -class R600_2OP_Helper inst, string opName, SDPatternOperator node, +class R600_2OP_Helper inst, string opName, + SDPatternOperator node = null_frag, InstrItinClass itin = AnyALU> : R600_2OP cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, } class VTX_READ buffer_id, dag outs, list pattern> - : InstR600ISA , + : InstR600ISA , VTX_WORD1_GPR { // Static fields @@ -328,18 +329,44 @@ class VTX_READ buffer_id, dag outs, list pattern> class LoadParamFrag : PatFrag < (ops node:$ptr), (load_type node:$ptr), - [{ return isConstantLoad(dyn_cast(N), 0); }] + [{ return isConstantLoad(cast(N), 0) || + (cast(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] >; def load_param : LoadParamFrag; def load_param_exti8 : LoadParamFrag; def load_param_exti16 : LoadParamFrag; -def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; +class LoadVtxId1 : PatFrag < + (ops node:$ptr), (load node:$ptr), [{ + const MemSDNode *LD = cast(N); + return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !isa(GetUnderlyingObject( + LD->getMemOperand()->getValue(), CurDAG->getDataLayout()))); +}]>; + +def vtx_id1_az_extloadi8 : LoadVtxId1 ; +def vtx_id1_az_extloadi16 : LoadVtxId1 ; +def vtx_id1_load : LoadVtxId1 ; + +class LoadVtxId2 : PatFrag < + (ops node:$ptr), (load node:$ptr), [{ + const MemSDNode *LD = cast(N); + return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa(GetUnderlyingObject( + LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); +}]>; + +def vtx_id2_az_extloadi8 : LoadVtxId2 ; +def vtx_id2_az_extloadi16 : LoadVtxId2 ; +def vtx_id2_load : LoadVtxId2 ; + +def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; def isR600toCayman : Predicate< - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; //===----------------------------------------------------------------------===// // R600 SDNodes @@ -407,8 +434,7 @@ def : 
Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, def INTERP_VEC_LOAD : AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins i32imm:$src0), - "INTERP_LOAD $src0 : $dst", - [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + "INTERP_LOAD $src0 : $dst">; def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; @@ -474,28 +500,6 @@ class ExportBufWord1 { } multiclass ExportPattern cf_inst> { - def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 0, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 7, 0, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy (i32 imm:$type)), - (ExportInst - (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy 1), - (ExportInst - (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) - >; - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), (ExportInst R600_Reg128:$src, imm:$type, imm:$base, @@ -507,22 +511,22 @@ multiclass ExportPattern cf_inst> { multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { // Stream0 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf0inst, 0)>; // Stream1 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; @@ -678,7 +682,7 @@ let Predicates = [isR600toCayman] in { def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; // Non-IEEE MUL: 0 * anything = 0 -def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">; def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; // TODO: Do these actually match the regular fmin/fmax behavior? 
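// Illustrative note on the TODO above (an assumption for exposition, not a
// statement of the hardware's actual behavior): IEEE-style fmax returns the
// non-NaN operand when one input is NaN, while a legacy compare-and-select
// max depends on operand order. In C++ terms:
//
//   float maxIEEE(float A, float B)   { return std::fmax(A, B); } // fmax(NaN, x) == x
//   float maxLegacy(float A, float B) { return A > B ? A : B; }
//   // NaN comparisons are false, so maxLegacy(NaN, x) == x
//   // but maxLegacy(x, NaN) == NaN.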
def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; @@ -733,6 +737,7 @@ def SETNE_DX10 : R600_2OP < [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] >; +// FIXME: Need combine for AMDGPUfract def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; @@ -758,6 +763,13 @@ def : Pat < (MOV_IMM_I32 imm:$val) >; +def MOV_IMM_GLOBAL_ADDR : MOV_IMM; +def : Pat < + (AMDGPUconstdata_ptr tglobaladdr:$addr), + (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr) +>; + + def MOV_IMM_F32 : MOV_IMM; def : Pat < (fpimm:$val), @@ -851,7 +863,7 @@ class R600_TEX inst, string opName> : i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, CT:$COORD_TYPE_W), - !strconcat(opName, + !strconcat(" ", opName, " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " "$SRC_GPR.$srcx$srcy$srcz$srcw " "RID:$RESOURCE_ID SID:$SAMPLER_ID " @@ -1099,14 +1111,13 @@ class RECIP_UINT_Common inst> : R600_1OP_Helper < // Clamped to maximum. class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp > { let Itinerary = TransALU; } class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy -> { + inst, "RECIPSQRT_IEEE", AMDGPUrsq> { let Itinerary = TransALU; } @@ -1134,11 +1145,6 @@ def FNEG_R600 : FNEG; // FIXME: Should be predicated on unsafe fp math. multiclass DIV_Common { -def : Pat< - (int_AMDGPU_div f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - def : Pat< (fdiv f32:$src0, f32:$src1), (MUL_IEEE $src0, (recip_ieee $src1)) @@ -1147,12 +1153,6 @@ def : Pat< def : RcpPat; } -class TGSI_LIT_Z_Common - : Pat < - (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) ->; - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1191,7 +1191,6 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common; def : POW_Common ; - def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat; @@ -1332,9 +1331,7 @@ def TXD: InstR600 < (outs R600_Reg128:$dst), (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [], NullALU > { let TEXInst = 1; } @@ -1344,10 +1341,7 @@ def TXD_SHADOW: InstR600 < (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], - NullALU -> { + [], NullALU> { let TEXInst = 1; } } // End isPseudo = 1 @@ -1426,8 +1420,7 @@ def TEX_VTX_CONSTBUF : } def TEX_VTX_TEXBUF: - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set v4f32:$dst, 
(int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">, VTX_WORD1_GPR, VTX_WORD0_eg { let VC_INST = 0; @@ -1542,8 +1535,9 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { //===---------------------------------------------------------------------===// let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(AMDGPUendpgm)] + >; } //===----------------------------------------------------------------------===// @@ -1729,12 +1723,6 @@ def : DwordAddrPat ; } // End isR600toCayman Predicate -let Predicates = [isR600] in { -// Intrinsic patterns -defm : Expand24IBitOps; -defm : Expand24UBitOps; -} // End isR600 - def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td index 9681747006d9..a5310e9fd6d0 100644 --- a/lib/Target/AMDGPU/R600Intrinsics.td +++ b/lib/Target/AMDGPU/R600Intrinsics.td @@ -11,65 +11,57 @@ // //===----------------------------------------------------------------------===// -let TargetPrefix = "R600", isTarget = 1 in { - class TextureIntrinsicFloatInput : - Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - class TextureIntrinsicInt32Input : - Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; +class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty], // coord_type_w + [IntrNoMem] +>; - def int_R600_load_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_const : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_R600_interp_xy : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; -def int_R600_interp_zw : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_R600_load_texbuf : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_tex : TextureIntrinsicFloatInput; - def int_R600_texc : TextureIntrinsicFloatInput; - def int_R600_txl : TextureIntrinsicFloatInput; - def int_R600_txlc : TextureIntrinsicFloatInput; - def int_R600_txb : TextureIntrinsicFloatInput; - def int_R600_txbc : TextureIntrinsicFloatInput; - def int_R600_txf : TextureIntrinsicInt32Input; - def int_R600_ldptr : TextureIntrinsicInt32Input; - def int_R600_txq : 
TextureIntrinsicInt32Input; - def int_R600_ddx : TextureIntrinsicFloatInput; - def int_R600_ddy : TextureIntrinsicFloatInput; - def int_R600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_stream_output : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_pixel_depth : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_pixel_stencil : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_dummy : - Intrinsic<[], [llvm_i32_ty], []>; -} +class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty], // coord_type_w + [IntrNoMem] +>; + +let TargetPrefix = "r600", isTarget = 1 in { + +def int_r600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [] +>; + +def int_r600_store_stream_output : Intrinsic< + [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] +>; + +def int_r600_tex : TextureIntrinsicFloatInput; +def int_r600_texc : TextureIntrinsicFloatInput; +def int_r600_txl : TextureIntrinsicFloatInput; +def int_r600_txlc : TextureIntrinsicFloatInput; +def int_r600_txb : TextureIntrinsicFloatInput; +def int_r600_txbc : TextureIntrinsicFloatInput; +def int_r600_txf : TextureIntrinsicInt32Input; +def int_r600_txq : TextureIntrinsicInt32Input; +def int_r600_ddx : TextureIntrinsicFloatInput; +def int_r600_ddy : TextureIntrinsicFloatInput; + +def int_r600_dot4 : Intrinsic<[llvm_float_ty], + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] +>; + +} // End TargetPrefix = "r600", isTarget = 1 diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h index 263561edd30d..04a4436ebe03 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -10,17 +10,16 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" -#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/SelectionDAG.h" #include namespace llvm { -class R600MachineFunctionInfo : public AMDGPUMachineFunction { +class R600MachineFunctionInfo final : public AMDGPUMachineFunction { void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index bcde5fb50dac..db18e5bd1afa 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" +#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" @@ -26,7 +27,7 @@ using namespace llvm; void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast(dag); - const AMDGPUSubtarget &ST = DAG->MF.getSubtarget(); + const R600Subtarget &ST = 
DAG->MF.getSubtarget(); TII = static_cast(DAG->TII); TRI = static_cast(DAG->TRI); VLIW5 = !ST.hasCaymanISA(); @@ -48,8 +49,7 @@ void R600SchedStrategy::MoveUnits(std::vector &QSrc, QSrc.clear(); } -static -unsigned getWFCountLimitedByGPR(unsigned GPRCount) { +static unsigned getWFCountLimitedByGPR(unsigned GPRCount) { assert (GPRCount && "GPRCount cannot be 0"); return 248 / GPRCount; } @@ -222,75 +222,74 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg, R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { MachineInstr *MI = SU->getInstr(); - if (TII->isTransOnly(MI)) + if (TII->isTransOnly(*MI)) return AluTrans; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return AluT_XYZW; - case AMDGPU::COPY: - if (MI->getOperand(1).isUndef()) { - // MI will become a KILL, don't consider it in scheduling - return AluDiscarded; - } - default: - break; - } - - // Does the instruction take a whole IG ? - // XXX: Is it possible to add a helper function in R600InstrInfo that can - // be used here and in R600PacketizerList::isSoloInstruction() ? - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { - return AluT_XYZW; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't consider it in scheduling + return AluDiscarded; } + default: + break; + } - if (TII->isLDSInstr(MI->getOpcode())) { - return AluT_X; - } + // Does the instruction take a whole IG ? + // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } - // Is the result already assigned to a channel ? - unsigned DestSubReg = MI->getOperand(0).getSubReg(); - switch (DestSubReg) { - case AMDGPU::sub0: - return AluT_X; - case AMDGPU::sub1: - return AluT_Y; - case AMDGPU::sub2: - return AluT_Z; - case AMDGPU::sub3: - return AluT_W; - default: - break; - } + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } - // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) - return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) - return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) - return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) - return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) - return AluT_XYZW; - - // LDS src registers cannot be used in the Trans slot. - if (TII->readsLDSSrcReg(MI)) - return AluT_XYZW; - - return AluAny; + // Is the result already assigned to a channel ?
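// (Added note, assuming the usual R600 sub-register layout: the four
// sub-registers map one-to-one onto the VLIW lanes, sub0 -> ALU.X,
// sub1 -> ALU.Y, sub2 -> ALU.Z, sub3 -> ALU.W, which is what the switch
// below encodes.)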
+ unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. + if (TII->readsLDSSrcReg(*MI)) + return AluT_XYZW; + + return AluAny; } int R600SchedStrategy::getInstKind(SUnit* SU) { @@ -324,9 +323,8 @@ SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { It != E; ++It) { SUnit *SU = *It; InstructionsGroupCandidate.push_back(SU->getInstr()); - if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) - && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) - ) { + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) && + (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) { InstructionsGroupCandidate.pop_back(); Q.erase((It + 1).base()); return SU; @@ -350,7 +348,7 @@ void R600SchedStrategy::PrepareNextSlot() { DEBUG(dbgs() << "New Slot\n"); assert (OccupedSlotsMask && "Slot wasn't filled"); OccupedSlotsMask = 0; -// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// if (HwGen == R600Subtarget::NORTHERN_ISLANDS) // OccupedSlotsMask |= 16; InstructionsGroupCandidate.clear(); LoadAlu(); diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index fc5b95c28e71..16d5d939708c 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -12,20 +12,19 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H -#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H -#include "R600InstrInfo.h" -#include "llvm/ADT/PriorityQueue.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" using namespace llvm; namespace llvm { -class R600SchedStrategy : public MachineSchedStrategy { +class R600InstrInfo; +struct R600RegisterInfo; +class R600SchedStrategy final : public MachineSchedStrategy { const ScheduleDAGMILive *DAG; const R600InstrInfo *TII; const R600RegisterInfo *TRI; diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 5efb3b9fc20e..ecae27d2233d 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -29,6 +29,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "R600Defines.h" #include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" @@ -210,9 +211,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector( (void)Tmp; SrcVec = DstReg; } - Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) - .addReg(SrcVec); - DEBUG(dbgs() << " ->"; Pos->dump();); + MachineInstr *NewMI = + 
BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec); + DEBUG(dbgs() << " ->"; NewMI->dump();); DEBUG(dbgs() << " Updating Swizzle:\n"); for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), @@ -224,11 +225,11 @@ MachineInstr *R600VectorRegMerger::RebuildVector( RSI->Instr->eraseFromParent(); // Update RSI - RSI->Instr = Pos; + RSI->Instr = NewMI; RSI->RegToChan = UpdatedRegToChan; RSI->UndefReg = UpdatedUndef; - return Pos; + return NewMI; } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { @@ -314,8 +315,13 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { } bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast(Fn.getSubtarget().getInstrInfo()); - MRI = &(Fn.getRegInfo()); + if (skipFunction(*Fn.getFunction())) + return false; + + const R600Subtarget &ST = Fn.getSubtarget(); + TII = ST.getInstrInfo(); + MRI = &Fn.getRegInfo(); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { MachineBasicBlock *MB = &*MBB; @@ -325,10 +331,10 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); MII != MIIE; ++MII) { - MachineInstr *MI = MII; - if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI->getOperand(1).getReg(); + MachineInstr &MI = *MII; + if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); It != E; ++It) { @@ -338,17 +344,17 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { continue; } - - RegSeqInfo RSI(*MRI, MI); + RegSeqInfo RSI(*MRI, &MI); // All uses of MI are swizzeable ? 
- unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg = MI.getOperand(0).getReg(); if (!areAllUsesSwizzeable(Reg)) continue; - DEBUG (dbgs() << "Trying to optimize "; - MI->dump(); - ); + DEBUG({ + dbgs() << "Trying to optimize "; + MI.dump(); + }); RegSeqInfo CandidateRSI; std::vector > RemapChan; diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 21269613a305..c84866469ae8 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -56,15 +56,14 @@ public: char R600Packetizer::ID = 0; class R600PacketizerList : public VLIWPacketizerList { - private: const R600InstrInfo *TII; const R600RegisterInfo &TRI; bool VLIW5; bool ConsideredInstUsesAlreadyWrittenVectorElement; - unsigned getSlot(const MachineInstr *MI) const { - return TRI.getHWRegChan(MI->getOperand(0).getReg()); + unsigned getSlot(const MachineInstr &MI) const { + return TRI.getHWRegChan(MI.getOperand(0).getReg()); } /// \returns register to PV chan mapping for bundle/single instructions that @@ -81,11 +80,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(&*BI); + int BISlot = getSlot(*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(&*BI)) + if (TII->isPredicated(*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +94,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(&*BI)) { + if (isTrans || TII->isTransOnly(*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -129,7 +128,7 @@ private: return Result; } - void substitutePV(MachineInstr *MI, const DenseMap &PVs) + void substitutePV(MachineInstr &MI, const DenseMap &PVs) const { unsigned Ops[] = { AMDGPU::OpName::src0, @@ -137,23 +136,23 @@ private: AMDGPU::OpName::src2 }; for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); if (OperandIdx < 0) continue; - unsigned Src = MI->getOperand(OperandIdx).getReg(); + unsigned Src = MI.getOperand(OperandIdx).getReg(); const DenseMap::const_iterator It = PVs.find(Src); if (It != PVs.end()) - MI->getOperand(OperandIdx).setReg(It->second); + MI.getOperand(OperandIdx).setReg(It->second); } } public: // Ctor. - R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST, + MachineLoopInfo &MLI) : VLIWPacketizerList(MF, MLI, nullptr), - TII(static_cast( - MF.getSubtarget().getInstrInfo())), + TII(ST.getInstrInfo()), TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getSubtarget().hasCaymanISA(); + VLIW5 = !ST.hasCaymanISA(); } // initPacketizerState - initialize some internal flags. @@ -162,32 +161,30 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(const MachineInstr *MI, + bool ignorePseudoInstruction(const MachineInstr &MI, const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. 
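// For example (illustrative, based on the checks that follow): vector
// instructions, non-ALU instructions, GROUP_BARRIER, and LDS instructions
// all occupy an instruction group by themselves, so each is "solo".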
- bool isSoloInstruction(const MachineInstr *MI) override { - if (TII->isVector(*MI)) + bool isSoloInstruction(const MachineInstr &MI) override { + if (TII->isVector(MI)) return true; - if (!TII->isALUInstr(MI->getOpcode())) + if (!TII->isALUInstr(MI.getOpcode())) return true; - if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) return true; // XXX: This can be removed once the packetizer properly handles all the // LDS instruction group restrictions. - if (TII->isLDSInstr(MI->getOpcode())) - return true; - return false; + return TII->isLDSInstr(MI.getOpcode()); } // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ // together. bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); - if (getSlot(MII) == getSlot(MIJ)) + if (getSlot(*MII) == getSlot(*MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; // Do MII and MIJ share the same pred_sel ? int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), @@ -210,14 +207,12 @@ public: } } - bool ARDef = TII->definesAddressRegister(MII) || - TII->definesAddressRegister(MIJ); - bool ARUse = TII->usesAddressRegister(MII) || - TII->usesAddressRegister(MIJ); - if (ARDef && ARUse) - return false; + bool ARDef = + TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ); + bool ARUse = + TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ); - return true; + return !ARDef || !ARUse; } // isLegalToPruneDependencies - Is it legal to prune dependence between SUI // and SUJ. bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { return false; } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); MI->getOperand(LastOp).setImm(Bit); } - bool isBundlableWithCurrentPMI(MachineInstr *MI, + bool isBundlableWithCurrentPMI(MachineInstr &MI, const DenseMap &PV, std::vector &BS, bool &isTransSlot) { @@ -240,11 +235,14 @@ public: // Is the dst reg sequence legal ? if (!isTransSlot && !CurrentPacketMIs.empty()) { - if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { + if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) { if (ConsideredInstUsesAlreadyWrittenVectorElement && + !TII->isVectorOnly(MI) && VLIW5) { isTransSlot = true; - DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + DEBUG({ + dbgs() << "Considering as Trans Inst :"; + MI.dump(); + }); } else return false; @@ -252,18 +250,18 @@ public: } // Are the Constants limitations met ? - CurrentPacketMIs.push_back(MI); + CurrentPacketMIs.push_back(&MI); if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG( + DEBUG({ dbgs() << "Couldn't pack :\n"; - MI->dump(); + MI.dump(); dbgs() << "with the following packets :\n"; for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { CurrentPacketMIs[i]->dump(); dbgs() << "\n"; } dbgs() << "because of Consts read limitations\n"; - ); + }); CurrentPacketMIs.pop_back(); return false; } @@ -271,16 +269,16 @@ public: // Is there a BankSwizzle set that meets Read Port limitations ?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS, isTransSlot)) { - DEBUG( + DEBUG({ dbgs() << "Couldn't pack :\n"; - MI->dump(); + MI.dump(); dbgs() << "with the following packets :\n"; for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { CurrentPacketMIs[i]->dump(); dbgs() << "\n"; } dbgs() << "because of Read port limitations\n"; - ); + }); CurrentPacketMIs.pop_back(); return false; } @@ -293,9 +291,9 @@ public: return true; } - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override { MachineBasicBlock::iterator FirstInBundle = - CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); + CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front(); const DenseMap &PV = getPreviousVector(FirstInBundle); std::vector BS; @@ -308,9 +306,9 @@ public: AMDGPU::OpName::bank_swizzle); MI->getOperand(Op).setImm(BS[i]); } - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS.back()); + unsigned Op = + TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); + MI.getOperand(Op).setImm(BS.back()); if (!CurrentPacketMIs.empty()) setIsLastBit(CurrentPacketMIs.back(), 0); substitutePV(MI, PV); @@ -320,7 +318,7 @@ public: } return It; } - endPacket(MI->getParent(), MI); + endPacket(MI.getParent(), MI); if (TII->isTransOnly(MI)) return MI; return VLIWPacketizerList::addToPacket(MI); @@ -328,15 +326,20 @@ public: }; bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + const R600Subtarget &ST = Fn.getSubtarget(); + const R600InstrInfo *TII = ST.getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis(); // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI); + R600PacketizerList Packetizer(Fn, ST, MLI); // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) + return false; + // // Loop over all basic blocks and remove KILL pseudo-instructions // These instructions confuse the dependence analysis. Consider: @@ -375,7 +378,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) + if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index fb0359cfc651..dfdc602b80cd 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -28,8 +28,8 @@ R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget(); + const R600InstrInfo *TII = ST.getInstrInfo(); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -89,3 +89,10 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { return true; } } + +void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index 4f8a129ce4a6..9dfb3106c6cc 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H -#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -21,7 +21,7 @@ namespace llvm { class AMDGPUSubtarget; -struct R600RegisterInfo : public AMDGPURegisterInfo { +struct R600RegisterInfo final : public AMDGPURegisterInfo { RegClassWeight RCW; R600RegisterInfo(); @@ -31,7 +31,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; - unsigned getHWRegIndex(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const; /// \brief get the register class of the specified type to use in the /// CFGStructurizer @@ -40,8 +40,13 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; - // \returns true if \p Reg can be defined in one ALU caluse and used in another. + // \returns true if \p Reg can be defined in one ALU clause and used in + // another. bool isPhysRegLiveAcrossClauses(unsigned Reg) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td index df62bf85c0ad..70fb46c1a7d6 100644 --- a/lib/Target/AMDGPU/R600Schedule.td +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -9,7 +9,7 @@ // // R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction // slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS -// slot has been removed. +// slot has been removed. 
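// Illustrative sketch of the slot structure described above (the helper is
// hypothetical, not part of the schedule model):
//
//   // pre-Cayman: { ALU.X, ALU.Y, ALU.Z, ALU.W, TRANS } -> 5 slots
//   // Cayman:     { ALU.X, ALU.Y, ALU.Z, ALU.W }        -> 4 slots
//   unsigned aluSlotsPerGroup(bool IsCayman) { return IsCayman ? 4 : 5; }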
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp deleted file mode 100644 index 2fc7b02f673f..000000000000 --- a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass translates tgsi-like texture intrinsics into R600 texture -/// closer to hardware intrinsics. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { -class R600TextureIntrinsicsReplacer : - public FunctionPass, public InstVisitor { - static char ID; - - Module *Mod; - Type *FloatType; - Type *Int32Type; - Type *V4f32Type; - Type *V4i32Type; - FunctionType *TexSign; - FunctionType *TexQSign; - - void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, - unsigned SrcSelect[4], unsigned CT[4], - bool &useShadowVariant) { - enum TextureTypes { - TEXTURE_1D = 1, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_CUBE, - TEXTURE_RECT, - TEXTURE_SHADOW1D, - TEXTURE_SHADOW2D, - TEXTURE_SHADOWRECT, - TEXTURE_1D_ARRAY, - TEXTURE_2D_ARRAY, - TEXTURE_SHADOW1D_ARRAY, - TEXTURE_SHADOW2D_ARRAY, - TEXTURE_SHADOWCUBE, - TEXTURE_2D_MSAA, - TEXTURE_2D_ARRAY_MSAA, - TEXTURE_CUBE_ARRAY, - TEXTURE_SHADOWCUBE_ARRAY - }; - - switch (TextureType) { - case 0: - useShadowVariant = false; - return; - case TEXTURE_RECT: - case TEXTURE_1D: - case TEXTURE_2D: - case TEXTURE_3D: - case TEXTURE_CUBE: - case TEXTURE_1D_ARRAY: - case TEXTURE_2D_ARRAY: - case TEXTURE_CUBE_ARRAY: - case TEXTURE_2D_MSAA: - case TEXTURE_2D_ARRAY_MSAA: - useShadowVariant = false; - break; - case TEXTURE_SHADOW1D: - case TEXTURE_SHADOW2D: - case TEXTURE_SHADOWRECT: - case TEXTURE_SHADOW1D_ARRAY: - case TEXTURE_SHADOW2D_ARRAY: - case TEXTURE_SHADOWCUBE: - case TEXTURE_SHADOWCUBE_ARRAY: - useShadowVariant = true; - break; - default: - llvm_unreachable("Unknow Texture Type"); - } - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CT[0] = 0; - CT[1] = 0; - } - - if (TextureType == TEXTURE_CUBE_ARRAY || - TextureType == TEXTURE_SHADOWCUBE_ARRAY) - CT[2] = 0; - - if (TextureType == TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (hasLOD && useShadowVariant) { - CT[1] = 0; - } else { - CT[2] = 0; - SrcSelect[2] = 1; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CT[2] = 0; - } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - !(hasLOD && useShadowVariant)) - SrcSelect[3] = 2; - } - - void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, - unsigned SrcSelect[4], Value *Offset[3], Value *Resource, - Value *Sampler, unsigned CT[4], Value *Coord) { - IRBuilder<> Builder(&I); - Constant *Mask[] = { - ConstantInt::get(Int32Type, SrcSelect[0]), - 
ConstantInt::get(Int32Type, SrcSelect[1]), - ConstantInt::get(Int32Type, SrcSelect[2]), - ConstantInt::get(Int32Type, SrcSelect[3]) - }; - Value *SwizzleMask = ConstantVector::get(Mask); - Value *SwizzledCoord = - Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); - - Value *Args[] = { - SwizzledCoord, - Offset[0], - Offset[1], - Offset[2], - Resource, - Sampler, - ConstantInt::get(Int32Type, CT[0]), - ConstantInt::get(Int32Type, CT[1]), - ConstantInt::get(Int32Type, CT[2]), - ConstantInt::get(Int32Type, CT[3]) - }; - - Function *F = Mod->getFunction(Name); - if (!F) { - F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); - F->addFnAttr(Attribute::ReadNone); - } - I.replaceAllUsesWith(Builder.CreateCall(F, Args)); - I.eraseFromParent(); - } - - void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, - const char *VanillaInt, - const char *ShadowInt) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(1); - Value *SamplerId = I.getArgOperand(2); - - unsigned TextureType = - cast(I.getArgOperand(3))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0) - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - - void ReplaceTXF(CallInst &I) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(4); - Value *SamplerId = I.getArgOperand(5); - - unsigned TextureType = - cast(I.getArgOperand(6))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - I.getArgOperand(1), - I.getArgOperand(2), - I.getArgOperand(3), - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - -public: - R600TextureIntrinsicsReplacer(): - FunctionPass(ID) { - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); - Mod = &M; - FloatType = Type::getFloatTy(Ctx); - Int32Type = Type::getInt32Ty(Ctx); - V4f32Type = VectorType::get(FloatType, 4); - V4i32Type = VectorType::get(Int32Type, 4); - Type *ArgsType[] = { - V4f32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); - Type *ArgsQType[] = { - V4i32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); - return false; - } - - bool runOnFunction(Function &F) override { - visit(F); - return false; - } - - const char *getPassName() const override { - return "R600 Texture Intrinsics Replacer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - } - - void visitCallInst(CallInst &I) { - if (!I.getCalledFunction()) - return; - - StringRef Name = I.getCalledFunction()->getName(); - if (Name == "llvm.AMDGPU.tex") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); - return; - } - if (Name == "llvm.AMDGPU.txl") { - ReplaceTexIntrinsic(I, true, TexSign, 
"llvm.R600.txl", "llvm.R600.txlc"); - return; - } - if (Name == "llvm.AMDGPU.txb") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); - return; - } - if (Name == "llvm.AMDGPU.txf") { - ReplaceTXF(I); - return; - } - if (Name == "llvm.AMDGPU.txq") { - ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); - return; - } - if (Name == "llvm.AMDGPU.ddx") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); - return; - } - if (Name == "llvm.AMDGPU.ddy") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); - return; - } - } - -}; - -char R600TextureIntrinsicsReplacer::ID = 0; - -} - -FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { - return new R600TextureIntrinsicsReplacer(); -} diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index fa4d24a2f25a..5f182c5304c6 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -34,17 +35,16 @@ typedef std::pair StackEntry; typedef SmallVector StackVector; // Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.SI.if"; -static const char *const ElseIntrinsic = "llvm.SI.else"; -static const char *const BreakIntrinsic = "llvm.SI.break"; -static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; -static const char *const LoopIntrinsic = "llvm.SI.loop"; -static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; +static const char *const IfIntrinsic = "llvm.amdgcn.if"; +static const char *const ElseIntrinsic = "llvm.amdgcn.else"; +static const char *const BreakIntrinsic = "llvm.amdgcn.break"; +static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break"; +static const char *const LoopIntrinsic = "llvm.amdgcn.loop"; +static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf"; class SIAnnotateControlFlow : public FunctionPass { - - static char ID; + DivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -69,6 +69,8 @@ class SIAnnotateControlFlow : public FunctionPass { LoopInfo *LI; + bool isUniform(BranchInst *T); + bool isTopOfStack(BasicBlock *BB); Value *popSaved(); @@ -83,13 +85,16 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + Value *handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L, BranchInst *Term); void handleLoop(BranchInst *Term); void closeControlFlow(BasicBlock *BB); public: + static char ID; + SIAnnotateControlFlow(): FunctionPass(ID) { } @@ -104,6 +109,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } @@ -112,6 +118,12 @@ public: } // end anonymous namespace +INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, + "Annotate SI Control Flow", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, + "Annotate SI Control Flow", false, false) + char SIAnnotateControlFlow::ID = 0; /// \brief 
@@ -152,6 +164,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
   return false;
 }
 
+/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// consider it as such?
+bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
+  return DA->isUniform(T->getCondition()) ||
+         T->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
 /// \brief Is BB the last block saved on the stack ?
 bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
   return !Stack.empty() && Stack.back().first == BB;
@@ -194,6 +213,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
 
 /// \brief Open a new "If" block
 void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
   Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
   Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
   push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -201,6 +223,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
 
 /// \brief Close the last "If" block and open a new "Else" block
 void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
   Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
   Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
   push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -208,7 +233,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 
 /// \brief Recursively handle the condition leading to a loop
 Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
-                                                  llvm::Loop *L) {
+                                                  llvm::Loop *L, BranchInst *Term) {
 
   // Only search through PHI nodes which are inside the loop. If we try this
   // with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -232,7 +257,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
       }
 
       Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
+      Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
       NewPhi->addIncoming(PhiArg, From);
     }
 
@@ -246,7 +271,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
       BasicBlock *From = Phi->getIncomingBlock(i);
 
       if (From == IDom) {
+        // We're in the following situation:
+        //   IDom/From
+        //      |   \
+        //      |    If-block
+        //      |   /
+        //     Parent
+        // where we want to break out of the loop if the If-block is not taken.
+        // Due to the depth-first traversal, there should be an end.cf
+        // intrinsic in Parent, and we insert an else.break before it.
+        //
+        // Note that the end.cf need not be the first non-phi instruction
+        // of parent, particularly when we're dealing with a multi-level
+        // break, but it should occur within a group of intrinsic calls
+        // at the beginning of the block.
         CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+        while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+          OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
         if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
           Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
           Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
@@ -271,14 +312,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
     Value *Args[] = { Cond, Broken };
     return CallInst::Create(IfBreak, Args, "", Insert);
 
+  // Insert IfBreak before TERM for constant COND.
+  } else if (isa<ConstantInt>(Cond)) {
+    Value *Args[] = { Cond, Broken };
+    return CallInst::Create(IfBreak, Args, "", Term);
+
   } else {
     llvm_unreachable("Unhandled loop condition!");
   }
 
-  return 0;
+  return nullptr;
 }
 
 /// \brief Handle a back edge (loop)
 void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
+
   BasicBlock *BB = Term->getParent();
   llvm::Loop *L = LI->getLoopFor(BB);
   BasicBlock *Target = Term->getSuccessor(1);
@@ -286,7 +336,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken, L);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
   for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
        PI != PE; ++PI) {
@@ -300,6 +350,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
   llvm::Loop *L = LI->getLoopFor(BB);
 
+  assert(Stack.back().first == BB);
+
   if (L && L->getHeader() == BB) {
     // We can't insert an EndCF call into a loop header, because it will
     // get executed on every iteration of the loop, when it should be
@@ -315,14 +367,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
     BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
   }
 
-  CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
+  Value *Exec = popSaved();
+  if (!isa<UndefValue>(Exec))
+    CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
 }
 
 /// \brief Annotate the control flow with intrinsics so the backend can
 /// recognize if/then/else and loops.
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  DA = &getAnalysis<DivergenceAnalysis>();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
@@ -332,12 +388,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
 
     if (!Term || Term->isUnconditional()) {
       if (isTopOfStack(*I))
        closeControlFlow(*I);
+
       continue;
     }
 
     if (I.nodeVisited(Term->getSuccessor(1))) {
       if (isTopOfStack(*I))
         closeControlFlow(*I);
+
       handleLoop(Term);
       continue;
     }
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
new file mode 100644
index 000000000000..65ceff3930ac
--- /dev/null
+++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -0,0 +1,96 @@
+//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Inserts one nop instruction for each high level source statement for
+/// debugger usage.
+///
+/// Tools, such as a debugger, need to pause execution based on user input (i.e.
+/// breakpoint). In order to do this, one nop instruction is inserted before the
+/// first isa instruction of each high level source statement. Further, the
+/// debugger may replace nop instructions with trap instructions based on user
+/// input.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "si-debugger-insert-nops"
+#define PASS_NAME "SI Debugger Insert Nops"
+
+namespace {
+
+class SIDebuggerInsertNops : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
+  const char *getPassName() const override { return PASS_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
+
+char SIDebuggerInsertNops::ID = 0;
+char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
+
+FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
+  return new SIDebuggerInsertNops();
+}
+
+bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
+  // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
+  // specified.
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  if (!ST.debuggerInsertNops())
+    return false;
+
+  // Skip machine functions without debug info.
+  if (!MF.getMMI().hasDebugInfo())
+    return false;
+
+  // Target instruction info.
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Set containing line numbers that have nop inserted.
+  DenseSet<unsigned> NopInserted;
+
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+      // Skip DBG_VALUE instructions and instructions without location.
+      if (MI->isDebugValue() || !MI->getDebugLoc())
+        continue;
+
+      // Insert nop instruction if line number does not have nop inserted.
+      auto DL = MI->getDebugLoc();
+      if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
+        BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+        NopInserted.insert(DL.getLine());
+      }
+    }
+  }
+
+  return true;
+}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index aa1e352ed748..54efdc0a0466 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -10,8 +10,8 @@
 
 #include "llvm/MC/MCInstrDesc.h"
 
-#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
-#define LLVM_LIB_TARGET_R600_SIDEFINES_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
 
 namespace SIInstrFlags {
 // This needs to be kept in sync with the field bits in InstSI.
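A minimal standalone sketch of how these flags are consumed (not part of the patch; the enumerators below are a hypothetical excerpt): each SIInstrFlags value is a single-bit mask over an instruction's 64-bit TSFlags word, which is why the hunk below can renumber MUBUF and friends upward to make room for the new SDWA and DPP bits while classification stays a single AND either way.

#include <cstdint>

namespace ExampleFlags { // hypothetical excerpt of SIInstrFlags
enum : uint64_t {
  VOP2  = 1ull << 11,
  SDWA  = 1ull << 14, // new encoding bit added by this patch
  DPP   = 1ull << 15, // new encoding bit added by this patch
  MUBUF = 1ull << 16  // shifted up two places to make room
};
} // namespace ExampleFlags

// Classifying an instruction is one bitwise AND against its TSFlags.
inline bool isDPP(uint64_t TSFlags) {
  return (TSFlags & ExampleFlags::DPP) != 0;
}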
@@ -29,16 +29,19 @@ enum { VOP2 = 1 << 11, VOP3 = 1 << 12, VOPC = 1 << 13, + SDWA = 1 << 14, + DPP = 1 << 15, - MUBUF = 1 << 14, - MTBUF = 1 << 15, - SMRD = 1 << 16, - DS = 1 << 17, - MIMG = 1 << 18, - FLAT = 1 << 19, - WQM = 1 << 20, - VGPRSpill = 1 << 21, - VOPAsmPrefer32Bit = 1 << 22 + MUBUF = 1 << 16, + MTBUF = 1 << 17, + SMRD = 1 << 18, + DS = 1 << 19, + MIMG = 1 << 20, + FLAT = 1 << 21, + WQM = 1 << 22, + VGPRSpill = 1 << 23, + VOPAsmPrefer32Bit = 1 << 24, + Gather4 = 1 << 25 }; } @@ -46,9 +49,14 @@ namespace llvm { namespace AMDGPU { enum OperandType { /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET, /// Operand with register or inline constant - OPERAND_REG_INLINE_C + OPERAND_REG_INLINE_C, + + /// Operand with 32-bit immediate that uses the constant bus. The standard + /// OPERAND_IMMEDIATE should be used for special immediates such as source + /// modifiers. + OPERAND_KIMM32 }; } } @@ -77,10 +85,13 @@ namespace SIInstrFlags { }; } +// Input operand modifiers bit-masks +// NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum { - NEG = 1 << 0, - ABS = 1 << 1 + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 0 // Integer sign-extend modifier }; } @@ -93,6 +104,109 @@ namespace SIOutMods { }; } +namespace llvm { +namespace AMDGPU { +namespace EncValues { // Encoding values of enum9/8/7 operands + +enum { + SGPR_MIN = 0, + SGPR_MAX = 101, + TTMP_MIN = 112, + TTMP_MAX = 123, + INLINE_INTEGER_C_MIN = 128, + INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64 + INLINE_INTEGER_C_MAX = 208, + INLINE_FLOATING_C_MIN = 240, + INLINE_FLOATING_C_MAX = 248, + LITERAL_CONST = 255, + VGPR_MIN = 256, + VGPR_MAX = 511 +}; + +} // namespace EncValues +} // namespace AMDGPU +} // namespace llvm + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. + +enum Id { // Message ID, width(4) [3:0]. + ID_UNKNOWN_ = -1, + ID_INTERRUPT = 1, + ID_GS, + ID_GS_DONE, + ID_SYSMSG = 15, + ID_GAPS_LAST_, // Indicate that sequence has gaps. + ID_GAPS_FIRST_ = ID_INTERRUPT, + ID_SHIFT_ = 0, + ID_WIDTH_ = 4, + ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) +}; + +enum Op { // Both GS and SYS operation IDs. + OP_UNKNOWN_ = -1, + OP_SHIFT_ = 4, + // width(2) [5:4] + OP_GS_NOP = 0, + OP_GS_CUT, + OP_GS_EMIT, + OP_GS_EMIT_CUT, + OP_GS_LAST_, + OP_GS_FIRST_ = OP_GS_NOP, + OP_GS_WIDTH_ = 2, + OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_), + // width(3) [6:4] + OP_SYS_ECC_ERR_INTERRUPT = 1, + OP_SYS_REG_RD, + OP_SYS_HOST_TRAP_ACK, + OP_SYS_TTRACE_PC, + OP_SYS_LAST_, + OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT, + OP_SYS_WIDTH_ = 3, + OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_) +}; + +enum StreamId { // Stream ID, (2) [9:8]. + STREAM_ID_DEFAULT_ = 0, + STREAM_ID_LAST_ = 4, + STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_, + STREAM_ID_SHIFT_ = 8, + STREAM_ID_WIDTH_= 2, + STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_) +}; + +} // namespace SendMsg + +namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. + +enum Id { // HwRegCode, (6) [5:0] + ID_UNKNOWN_ = -1, + ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined. 
+ ID_SYMBOLIC_LAST_ = 8, + ID_SHIFT_ = 0, + ID_WIDTH_ = 6, + ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) +}; + +enum Offset { // Offset, (5) [10:6] + OFFSET_DEFAULT_ = 0, + OFFSET_SHIFT_ = 6, + OFFSET_WIDTH_ = 5, + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) +}; + +enum WidthMinusOne { // WidthMinusOne, (5) [15:11] + WIDTH_M1_DEFAULT_ = 31, + WIDTH_M1_SHIFT_ = 11, + WIDTH_M1_WIDTH_ = 5, + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) +}; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm + #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) @@ -134,7 +248,7 @@ namespace SIOutMods { #define C_00B84C_LDS_SIZE 0xFF007FFF #define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) #define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) -#define C_00B84C_EXCP_EN +#define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC #define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 @@ -194,5 +308,7 @@ namespace SIOutMods { #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_SPILLED_SGPRS 0x4 +#define R_SPILLED_VGPRS 0x8 #endif diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f59d9948f98e..9e0086b79087 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -77,7 +77,7 @@ using namespace llvm; -#define DEBUG_TYPE "sgpr-copies" +#define DEBUG_TYPE "si-fix-sgpr-copies" namespace { @@ -237,11 +237,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); SmallVector Worklist; diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp deleted file mode 100644 index 8bda283f0fca..000000000000 --- a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ /dev/null @@ -1,219 +0,0 @@ -//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file SALU instructions ignore the execution mask, so we need to modify the -/// live ranges of the registers they define in some cases. -/// -/// The main case we need to handle is when a def is used in one side of a -/// branch and not another. For example: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// -/// Here we need the register allocator to avoid assigning any of the defs -/// inside of the IF to the same register as %def. In traditional live -/// interval analysis %def is not live inside the IF branch, however, since -/// SALU instructions inside of IF will be executed even if the branch is not -/// taken, there is the chance that one of the instructions will overwrite the -/// value of %def, so the use in ELSE will see the wrong value. 
-/// -/// The strategy we use for solving this is to add an extra use after the ENDIF: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// %use -/// -/// Adding this use will make the def live throughout the IF branch, which is -/// what we want. - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-sgpr-live-ranges" - -namespace { - -class SIFixSGPRLiveRanges : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR live ranges"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - - AU.addRequired(); - AU.addPreserved(); - AU.setPreservesCFG(); - - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) - -char SIFixSGPRLiveRanges::ID = 0; - -char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; - -FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { - return new SIFixSGPRLiveRanges(); -} - -bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const SIRegisterInfo *TRI = static_cast( - MF.getSubtarget().getRegisterInfo()); - bool MadeChange = false; - - MachinePostDominatorTree *PDT = &getAnalysis(); - SmallVector SGPRLiveRanges; - - LiveVariables *LV = &getAnalysis(); - MachineBasicBlock *Entry = &MF.front(); - - // Use a depth first order so that in SSA, we encounter all defs before - // uses. Once the defs of the block have been found, attempt to insert - // SGPR_USE instructions in successor blocks if required. - for (MachineBasicBlock *MBB : depth_first(Entry)) { - for (const MachineInstr &MI : *MBB) { - for (const MachineOperand &MO : MI.defs()) { - // We should never see a live out def of a physical register, so we also - // do not need to worry about implicit_defs(). - unsigned Def = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) { - // Only consider defs that are live outs. We don't care about def / - // use within the same block. - - // LiveVariables does not consider registers that are only used in a - // phi in a sucessor block as live out, unlike LiveIntervals. - // - // This is OK because SIFixSGPRCopies replaced any SGPR phis with - // VGPRs. 
- if (LV->isLiveOut(Def, *MBB)) - SGPRLiveRanges.push_back(Def); - } - } - } - } - - if (MBB->succ_size() < 2) - continue; - - // We have structured control flow, so the number of successors should be - // two. - assert(MBB->succ_size() == 2); - MachineBasicBlock *SuccA = *MBB->succ_begin(); - MachineBasicBlock *SuccB = *(++MBB->succ_begin()); - MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); - - if (!NCD) - continue; - - MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); - - if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { - assert(NCD->succ_size() == 2); - // We want to make sure we insert the Use after the ENDIF, not after - // the ELSE. - NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), - *(++NCD->succ_begin())); - } - - for (unsigned Reg : SGPRLiveRanges) { - // FIXME: We could be smarter here. If the register is Live-In to one - // block, but the other doesn't have any SGPR defs, then there won't be a - // conflict. Also, if the branch condition is uniform then there will be - // no conflict. - bool LiveInToA = LV->isLiveIn(Reg, *SuccA); - bool LiveInToB = LV->isLiveIn(Reg, *SuccB); - - if (!LiveInToA && !LiveInToB) { - DEBUG(dbgs() << PrintReg(Reg, TRI, 0) - << " is live into neither successor\n"); - continue; - } - - if (LiveInToA && LiveInToB) { - DEBUG(dbgs() << PrintReg(Reg, TRI, 0) - << " is live into both successors\n"); - continue; - } - - // This interval is live in to one successor, but not the other, so - // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected for " - << PrintReg(Reg, TRI, 0) - << " BB#" << SuccA->getNumber() - << ", BB#" << SuccB->getNumber() - << " with NCD = BB#" << NCD->getNumber() << '\n'); - - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "Not expecting to extend live range of physreg"); - - // FIXME: Need to figure out how to update LiveRange here so this pass - // will be able to preserve LiveInterval analysis. - MachineInstr *NCDSGPRUse = - BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - - MadeChange = true; - LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); - - DEBUG(NCDSGPRUse->dump()); - } - } - - return MadeChange; -} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 6230d1e28b74..4ecc0fcc6232 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,12 +13,9 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -44,8 +41,6 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -76,11 +71,8 @@ struct FoldCandidate { } // End anonymous namespace. 
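A minimal standalone sketch of the fold-or-commute pattern that the tryAddToFoldList hunks below keep intact while migrating to reference-based TII hooks. Everything here is a hypothetical stand-in (UseSlot, the callbacks), not the real TargetInstrInfo API:

#include <functional>

struct UseSlot { int OpNo; }; // operand position we want to fold into

// Try folding directly; if the operand is illegal in that slot, try
// commuting the instruction and folding into the swapped slot instead.
inline bool foldOrCommute(UseSlot &Use,
                          const std::function<bool(int)> &isOperandLegal,
                          const std::function<bool(int &)> &commuteWith) {
  if (isOperandLegal(Use.OpNo))
    return true;                  // direct fold works
  int Swapped = Use.OpNo;
  if (!commuteWith(Swapped))      // no commutable partner for this operand
    return false;
  Use.OpNo = Swapped;             // retry in the commuted position
  return isOperandLegal(Use.OpNo);
}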
-INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) +INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) char SIFoldOperands::ID = 0; @@ -140,7 +132,7 @@ static bool tryAddToFoldList(std::vector &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { // Special case for v_mac_f32_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); @@ -167,7 +159,7 @@ static bool tryAddToFoldList(std::vector &FoldList, // see if this makes it possible to fold. unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; - bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { if (CommuteIdx0 == OpNo) @@ -185,10 +177,10 @@ static bool tryAddToFoldList(std::vector &FoldList, return false; if (!CanCommute || - !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) + !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) return false; } @@ -301,9 +293,13 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, } bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + const SISubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 7d20509c464d..03b11f0fd38d 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -11,6 +11,8 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "AMDGPUSubtarget.h" + #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -21,24 +23,13 @@ using namespace llvm; static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, const MachineFrameInfo *FrameInfo) { - if (!FuncInfo->hasSpilledSGPRs()) - return false; - - if (FuncInfo->hasSpilledVGPRs()) - return false; - - for (int I = FrameInfo->getObjectIndexBegin(), - E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { - if (!FrameInfo->isSpillSlotObjectIndex(I)) - return false; - } - - return true; + return FuncInfo->hasSpilledSGPRs() && + (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects()); } static ArrayRef getAllSGPR128() { - return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), - AMDGPU::SReg_128RegClass.getNumRegs()); + return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), + AMDGPU::SGPR_128RegClass.getNumRegs()); } static ArrayRef getAllSGPRs() { @@ -48,6 +39,12 @@ static ArrayRef getAllSGPRs() { void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { + // Emit debugger prologue if "amdgpu-debugger-emit-prologue" 
attribute was + // specified. + const SISubtarget &ST = MF.getSubtarget(); + if (ST.debuggerEmitPrologue()) + emitDebuggerPrologue(MF, MBB); + if (!MF.getFrameInfo()->hasStackObjects()) return; @@ -63,10 +60,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) return; - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - const AMDGPUSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineBasicBlock::iterator I = MBB.begin(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); @@ -84,6 +81,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } + if (MFI->hasFlatScratchInit()) { + // We don't need this if we only have spills since there is no user facing + // scratch. + + // TODO: If we know we don't have flat instructions earlier, we can omit + // this from the input registers. + // + // TODO: We only need to know if we access scratch space through a flat + // pointer. Because we only detect if flat instructions are used at all, + // this will be used more often than necessary on VI. + + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. + DebugLoc DL; + + unsigned FlatScratchInitReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + // Copy the size in bytes. + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + + unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + + // Add wave offset in bytes to private base offset. + // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + + // Convert offset to 256-byte units. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitLo, RegState::Kill) + .addImm(8); + } + // If we reserved the original input registers, we don't need to copy to the // reserved registers. if (ScratchRsrcReg == PreloadedPrivateBufferReg) { @@ -96,7 +133,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. - MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); @@ -137,15 +173,28 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { MachineRegisterInfo &MRI = MF.getRegInfo(); - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + + // We need to drop register from the end of the list that we cannot use + // for the scratch wave offset. + // + 2 s102 and s103 do not exist on VI. 
+ // + 2 for vcc + // + 2 for xnack_mask + // + 2 for flat_scratch + // + 4 for registers reserved for scratch resource register + // + 1 for register reserved for scratch wave offset. (By exluding this + // register from the list to consider, it means that when this + // register is being used for the scratch wave offset and there + // are no other free SGPRs, then the value will stay in this register. + // ---- + // 13 + for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. if (!MRI.isPhysRegUsed(Reg)) { - assert(MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + if (!MRI.isAllocatable(Reg) || + TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) + continue; MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); ScratchWaveOffsetReg = Reg; @@ -160,7 +209,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - MachineBasicBlock::iterator I = MBB.begin(); DebugLoc DL; if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { @@ -223,6 +271,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } } +void SIFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { @@ -243,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( RS->addScavengingFrameIndex(ScavengeFI); } } + +void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SISubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Get work group ID SGPR, and make it live-in again. + unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i); + MF.getRegInfo().addLiveIn(WorkGroupIDSGPR); + MBB.addLiveIn(WorkGroupIDSGPR); + + // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in + // order to spill it to scratch. + unsigned WorkGroupIDVGPR = + MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR) + .addReg(WorkGroupIDSGPR); + + // Spill work group ID. + int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false, + WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + + // Get work item ID VGPR, and make it live-in again. + unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i); + MF.getRegInfo().addLiveIn(WorkItemIDVGPR); + MBB.addLiveIn(WorkItemIDVGPR); + + // Spill work item ID. 
+ int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false, + WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + } +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index a9152fd8b2aa..37417d098f31 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -23,10 +23,16 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; + +private: + /// \brief Emits debugger prologue. + void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 544867513d9c..51241cf0a432 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -18,33 +18,46 @@ #include #endif -#include "SIISelLowering.h" #include "AMDGPU.h" -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; -SITargetLowering::SITargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) +// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv. 
+static cl::opt EnableAMDGPUFastFDIV( + "amdgpu-fast-fdiv", + cl::desc("Enable faster 2.5 ulp fdiv"), + cl::init(false)); + +static unsigned findFirstFreeSGPR(CCState &CCInfo) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { + if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { + return AMDGPU::SGPR0 + Reg; + } + } + llvm_unreachable("Cannot allocate sgpr"); +} + +SITargetLowering::SITargetLowering(const TargetMachine &TM, + const SISubtarget &STI) : AMDGPUTargetLowering(TM, STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); @@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - - setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::ADDC, MVT::i32, Legal); - setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::SUBC, MVT::i32, Legal); - setOperationAction(ISD::SUBE, MVT::i32, Legal); - - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + + setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Promote); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - - setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); - - setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); - setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::LOAD, MVT::v2i64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v2i64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - - - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + 
setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { + switch (Op) { case ISD::LOAD: case ISD::STORE: case ISD::BUILD_VECTOR: @@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, + // and output demarshalling + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + + // We can't return success/failure, only the old value, + // let LLVM add the comparison + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + + if (getSubtarget()->hasFlatAddressSpace()) { + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + } + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + // On SI this is s_memtime and s_memrealtime on VI. + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FCANONICALIZE); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. 
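The constructor above marks ATOMIC_CMP_SWAP_WITH_SUCCESS as Expand because the buffer/flat compare-and-swap returns only the old value; LLVM then materializes the success flag with an explicit comparison. A standalone C++ analogue of that recovery (standard library only, not target code):

#include <atomic>

bool cmpSwapWithSuccess(std::atomic<int> &A, int Expected, int Desired,
                        int &OldOut) {
  int Old = Expected;
  A.compare_exchange_strong(Old, Desired); // primitive yields the old value
  OldOut = Old;
  return Old == Expected;                  // the comparison LLVM adds
}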
@@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); } +const SISubtarget *SITargetLowering::getSubtarget() const { + return static_cast(Subtarget); +} + //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &CI, + unsigned IntrID) const { + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = true; + return true; + default: + return false; + } +} + bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl &, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no @@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. // FIXME: This assumption is currently wrong. On VI we still use @@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (DL.getTypeStoreSize(Ty) < 4) return isLegalMUBUFAddressingMode(AM); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: return isLegalMUBUFAddressingMode(AM); case AMDGPUAS::LOCAL_ADDRESS: @@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + // For an unknown address space, this usually means that this is for some + // reason being used for pure arithmetic, and not based on some addressing + // computation. We don't have instructions that compute pointers with any + // addressing modes, so treat them as having no offset like flat + // instructions. return isLegalFlatAddressingMode(AM); default: @@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". 
Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. bool AlignedBy4 = (Align % 4 == 0); if (IsFast) *IsFast = AlignedBy4; + return AlignedBy4; } + if (Subtarget->hasUnalignedBufferAccess()) { + // If we have an uniform constant load, it still requires using a slow + // buffer instruction if unaligned. + if (IsFast) { + *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + (Align % 4 == 0) : true; + } + + return true; + } + // Smaller than dword value must be aligned. - // FIXME: This should be allowed on CI+ if (VT.bitsLT(MVT::i32)) return false; @@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) { bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } - bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers - if (isa(Ptr) || isa(Ptr) || isa(Ptr) || - isa(Ptr)) + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa(Ptr) || isa(Ptr) || + isa(Ptr) || isa(Ptr)) return true; - const Instruction *I = dyn_cast_or_null(Ptr); + const Instruction *I = dyn_cast(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); return TII->isInlineConstant(Imm); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { +bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { + + // SimplifySetCC uses this function to determine whether or not it should + // create setcc with i1 operands. We don't have instructions for i1 setcc. 
+ if (VT == MVT::i1 && Op == ISD::SETCC) + return false; + + return TargetLowering::isTypeDesirableForOp(Op, VT); +} + +SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, SDValue Chain, + unsigned Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); +} +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue PtrOffset = DAG.getUNDEF(PtrVT); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, if (MemVT.isFloatingPoint()) ExtTy = ISD::EXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment + SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, + PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); + const SmallVectorImpl &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); - const AMDGPUSubtarget &ST = MF.getSubtarget(); + const SISubtarget &ST = MF.getSubtarget(); - if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); - return SDValue(); + return DAG.getEntryNode(); } - // FIXME: We currently assume all calling conventions are kernels. + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. 
+ if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); SmallVector Splits; BitVector Skipped(Ins.size()); @@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { @@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments( ++PSInputNum; } - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); + if (AMDGPU::isShader(CallConv)) { + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + } else { + Splits.push_back(Arg); } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); } } @@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. - if (Info->getShaderType() == ShaderType::PIXEL && + if (CallConv == CallingConv::AMDGPU_PS && ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11)))) { + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); Info->markPSInputAllocated(0); Info->PSInputEna |= 1; } - if (Info->getShaderType() == ShaderType::COMPUTE) { + if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); + + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
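A minimal standalone sketch of the argument-splitting step above (ArgDesc is a hypothetical stand-in for ISD::InputArg): one N-element vector formal argument becomes N scalar arguments, so a three-element vertex attribute occupies exactly three registers, never four.

#include <vector>

struct ArgDesc {
  unsigned PartOffset;    // byte offset of this piece of the argument
  unsigned ElemStoreSize; // store size of one vector element
};

std::vector<ArgDesc> splitVectorArg(ArgDesc Arg, unsigned NumElements) {
  std::vector<ArgDesc> Splits;
  for (unsigned J = 0; J != NumElements; ++J) {
    Splits.push_back(Arg);
    Arg.PartOffset += Arg.ElemStoreSize; // next element starts here
  }
  return Splits;
}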
@@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(DispatchPtrReg); } + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(QueuePtrReg); + } + if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector Chains; @@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto *ParamTy = dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments( NumElements = Arg.VT.getVectorNumElements() - NumElements; Regs.append(NumElements, DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); continue; } @@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = Info->addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("work group id x is always enabled"); + } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); @@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg - = Info->addPrivateSegmentWaveByteOffset(); + unsigned PrivateSegmentWaveByteOffsetReg; + + if (AMDGPU::isShader(CallConv)) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } else + PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); @@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to + // check all stack objects later. + if (HasStackObjects) + Info->setHasNonSpillStackObjects(true); if (ST.isAmdHsaOS()) { // TODO: Assume we will spill without optimizations. 
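The PS input workaround earlier in LowerFormalArguments encodes a hardware validation rule that is easy to misread from the nested condition. Here is a standalone restatement of that check in plain C++; the masks 0xF for PERSP_*, 0x70 for LINEAR_* and input 11 for POS_W_FLOAT are taken from the comment in that hunk, and the function name exists only for this sketch.

    #include <cassert>
    #include <cstdint>

    // True when the compiler must force-enable an input (the VGPR0/VGPR1
    // allocation above), because the configuration would otherwise be invalid.
    bool needsForcedPSInput(uint32_t PSInputAddr, bool PosWFloatAllocated) {
      bool AnyInterp = (PSInputAddr & 0x7F) != 0; // any PERSP_* or LINEAR_*
      bool AnyPersp = (PSInputAddr & 0xF) != 0;   // any PERSP_*
      return !AnyInterp || (!AnyPersp && PosWFloatAllocated);
    }

    int main() {
      assert(needsForcedPSInput(0x00, false));  // nothing enabled at all
      assert(needsForcedPSInput(0x70, true));   // POS_W_FLOAT without PERSP_*
      assert(!needsForcedPSInput(0x01, false)); // one PERSP_* input suffices
      return 0;
    }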
@@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments(
 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
 CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("workitem id x should always be enabled");
+ }
 if (Info->hasWorkItemIDY()) {
 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
@@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments(
 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
-SDValue SITargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
 MachineFunction &MF = DAG.getMachineFunction();
 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Info->getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(CallConv))
 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg,
 Outs, OutVals, DL, DAG);
@@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain,
 if (Flag.getNode())
 RetOps.push_back(Flag);
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
-MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
-
- switch (MI->getOpcode()) {
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("m0", AMDGPU::M0)
+ .Case("exec", AMDGPU::EXEC)
+ .Case("exec_lo", AMDGPU::EXEC_LO)
+ .Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Default(AMDGPU::NoRegister);
+
+ if (Reg == AMDGPU::NoRegister) {
+ report_fatal_error(Twine("invalid register name \"" +
+ StringRef(RegName) + "\"."));
+
+ }
+
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ report_fatal_error(Twine("invalid register \"" +
+ StringRef(RegName) + "\" for subtarget."));
+ }
+
+ switch (Reg) {
+ case AMDGPU::M0:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ if (VT.getSizeInBits() == 32)
+ return Reg;
+ break;
+ case AMDGPU::EXEC:
+ case AMDGPU::FLAT_SCR:
+ if (VT.getSizeInBits() == 64)
+ return Reg;
+ break;
 default:
- return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ llvm_unreachable("missing register type checking");
+ }
+
+ report_fatal_error(Twine("invalid type for register \"" +
+ StringRef(RegName) + "\"."));
+}
+
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator SplitPoint(&MI);
+ ++SplitPoint;
+
+ if (SplitPoint == BB->end()) {
+ // Don't bother with a new block.
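+ // The kill is already the last instruction in the block, so it can simply
+ // be retagged as the terminator form in place.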
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineBasicBlock *SplitBB + = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + // Fix the block phi references to point to the new block for the defs in the + // second piece of the block. + for (MachineBasicBlock *Succ : BB->successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &FromBB = MI.getOperand(I); + if (BB == FromBB.getMBB()) { + FromBB.setMBB(SplitBB); + break; + } + } + } + } + + MF->insert(++MachineFunction::iterator(BB), SplitBB); + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + + SplitBB->transferSuccessors(BB); + BB->addSuccessor(SplitBB); + + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return SplitBB; +} + +MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + case AMDGPU::SI_INIT_M0: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(MI.getOperand(0)); + MI.eraseFromParent(); + break; + } case AMDGPU::BRANCH: return BB; + case AMDGPU::GET_GROUPSTATICSIZE: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_KILL: + return splitKillBlock(MI, BB); + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } return BB; } @@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { MachineFunction &MF = DAG.getMachineFunction(); @@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalAddress(MFI, Op, DAG); } case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::TRAP: return lowerTRAP(Op, DAG); } return SDValue(); } @@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { FrameIndexSDNode *FINode = cast(Op); unsigned FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. 
 //
- // If we know the machine uses less than 128GB of scratch, then we can
- // amrk the high bit of the FrameIndex node as known zero,
- // which is important, because it means in most situations we can
- // prove that values derived from FrameIndex nodes are non-negative.
- // This enables us to take advantage of more addressing modes when
- // accessing scratch buffers, since for scratch reads/writes, the register
- // offset must always be positive.
+ // The maximum private allocation for the entire GPU is 4G, and we are
+ // concerned with the largest the index could ever be for an individual
+ // workitem. This will occur with the minimum dispatch size. If a program
+ // requires more, the dispatch size will be reduced.
+ //
+ // With this limit, we can mark the high bit of the FrameIndex node as known
+ // zero, which is important, because it means in most situations we can prove
+ // that values derived from FrameIndex nodes are non-negative. This enables us
+ // to take advantage of more addressing modes when accessing scratch buffers,
+ // since for scratch reads/writes, the register offset must always be
+ // positive.
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- if (Subtarget->enableHugeScratchBuffer())
- return TFI;
+ uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+ // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+ // granularity. It is probably a full wave.
+ uint64_t MinGranularity = 32;
+
+ unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+ DAG.getValueType(ExtVT));
+}
+
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+ if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case AMDGPUIntrinsic::amdgcn_if:
+ case AMDGPUIntrinsic::amdgcn_else:
+ case AMDGPUIntrinsic::amdgcn_break:
+ case AMDGPUIntrinsic::amdgcn_if_break:
+ case AMDGPUIntrinsic::amdgcn_else_break:
+ case AMDGPUIntrinsic::amdgcn_loop:
+ case AMDGPUIntrinsic::amdgcn_end_cf:
+ return true;
+ }
+}
+
+void SITargetLowering::createDebuggerPrologueStackObjects(
+ MachineFunction &MF) const {
+ // Create stack objects that are used for emitting debugger prologue.
+ //
+ // Debugger prologue writes work group IDs and work item IDs to scratch memory
+ // at a fixed location in the following format:
+ // offset 0: work group ID x
+ // offset 4: work group ID y
+ // offset 8: work group ID z
+ // offset 16: work item ID x
+ // offset 20: work item ID y
+ // offset 24: work item ID z
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ int ObjectIdx = 0;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Create fixed stack object for work group ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+ Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+ // Create fixed stack object for work item ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+ Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+ }
 }
 /// This transforms the control flow intrinsics to get the branch destination as
@@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 SDNode *Intr = BRCOND.getOperand(1).getNode();
 SDValue Target = BRCOND.getOperand(2);
 SDNode *BR = nullptr;
+ SDNode *SetCC = nullptr;
 if (Intr->getOpcode() == ISD::SETCC) {
 // As long as we negate the condition everything is fine
- SDNode *SetCC = Intr;
- assert(SetCC->getConstantOperandVal(1) == 1);
- assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
- ISD::SETNE);
+ SetCC = Intr;
 Intr = SetCC->getOperand(0).getNode();
 } else {
@@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 Target = BR->getOperand(1);
 }
- assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ if (!isCFIntrinsic(Intr)) {
+ // This is a uniform branch so we don't need to legalize.
+ return BRCOND;
+ }
+
+ assert(!SetCC ||
+ (SetCC->getConstantOperandVal(1) == 1 &&
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+ ISD::SETNE));
 // Build the result and
 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
@@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 return Chain;
 }
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+ SelectionDAG &DAG) const {
+ SDLoc SL;
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, SL, MVT::i64));
+
+ // TODO: Use custom target PseudoSourceValue.
+ // TODO: We should use the value from the IR intrinsic call, but it might not
+ // be available and how do we get it?
+ Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ MinAlign(64, StructOffset),
+ MachineMemOperand::MOInvariant);
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+ SDValue Src = ASC->getOperand(0);
+
+ // FIXME: Really support non-0 null pointers.
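+ //
+ // On this target the segment (local/private) null pointer is all ones while
+ // the flat null pointer is zero, so casting a null pointer is not a plain
+ // truncate or extend of the bits; hence the compare-and-select sequences
+ // below.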
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); + SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + + // flat -> local/private + if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + return DAG.getNode(ISD::SELECT, SL, MVT::i32, + NonNull, Ptr, SegmentNullPtr); + } + } + + // local/private -> flat + if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull + = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue CvtPtr + = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, + DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + FlatNullPtr); + } + } + + // global <-> flat are no-ops and never emitted. + + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + DAG.getContext()->diagnose(InvalidAddrSpaceCast); + + return DAG.getUNDEF(ASC->getValueType(0)); +} + +static bool shouldEmitGOTReloc(const GlobalValue *GV, + const TargetMachine &TM) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); +} + +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // We can fold offsets for anything that doesn't require a GOT relocation. + return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); +} + +static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + SDLoc DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { + // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is + // lowered to the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. 
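+ //
+ // Roughly: if s_getpc_b64 returns P (the address of the s_add_u32), the
+ // $symbol literal is encoded at P + 4, the relocation resolves it to
+ // (GV + 4) - (P + 4), and the add then produces P plus that value, which
+ // is exactly GV.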
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
+}
+
 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
 SDValue Op,
 SelectionDAG &DAG) const {
 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 SDLoc DL(GSD);
 const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
+ EVT PtrVT = Op.getValueType();
+
+ if (!shouldEmitGOTReloc(GV, getTargetMachine()))
+ return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+
+ SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+ SIInstrInfo::MO_GOTPCREL);
+
+ Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ MachineMemOperand::MOInvariant);
 }
-SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
- SDValue V) const {
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "trap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ DAG.getContext()->diagnose(NoTrap);
+
+ // Emit s_endpgm.
+
+ // FIXME: This should really be selected to s_trap, but that requires
+ // setting up the trap handler for it to do anything.
+ return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+ const SDLoc &DL, SDValue V) const {
+ // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+ // the destination register.
+ //
 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
 // so we will end up with redundant moves to m0.
 //
- // We can't use S_MOV_B32, because there is no way to specify m0 as the
- // destination register.
- //
- // We have to use them both. Machine cse will combine all the S_MOV_B32
- // instructions and the register coalescer eliminate the extra copies.
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
- return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
- SDValue(M0, 0), SDValue()); // Glue
- // A Null SDValue creates
- // a glue result.
+ // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+
+ // A Null SDValue creates a glue result.
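+ // SI_INIT_M0 below produces a chain and a glue result; callers read the
+ // glue via getValue(1) to glue m0 consumers directly to the write.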
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, + V, Chain); + return SDValue(M0, 0); } SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, @@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, DAG.getValueType(VT)); } +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + case Intrinsic::amdgcn_queue_ptr: { if (!Subtarget->isAmdHsaOS()) { - DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), - "hsa intrinsic without hsa target"); + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "unsupported hsa intrinsic without hsa target", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } + auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); - + TRI->getPreloadedValue(MF, Reg), VT); + } + case Intrinsic::amdgcn_implicitarg_ptr: { + unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + } + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Reg + = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } + case Intrinsic::amdgcn_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq: + case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } + case Intrinsic::amdgcn_rsq_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } case Intrinsic::r600_read_ngroups_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: + if (Subtarget->isAmdHsaOS()) + return 
emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::AMDGPU_read_workdim: + case Intrinsic::amdgcn_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. // Really only 2 bits. return lowerImplicitZextParam(DAG, Op, MVT::i8, getImplicitParameterOffset(MFI, GRID_DIM)); + case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); + case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); @@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); case AMDGPUIntrinsic::SI_fs_constant: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); SDValue Glue = M0.getValue(1); @@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_sin: + return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_cos: + return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_log_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + case Intrinsic::amdgcn_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::amdgcn_fract: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::amdgcn_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } } +SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + MemSDNode *M = cast(Op); + unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default:
+ return SDValue();
+ }
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 SelectionDAG &DAG) const {
 MachineFunction &MF = DAG.getMachineFunction();
@@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
 Op->getVTList(), Ops, VT, MMO);
 }
+ case AMDGPUIntrinsic::AMDGPU_kill: {
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
+ if (!K->isNegative())
+ return Chain;
+ }
+
+ return Op;
+ }
 default:
 return SDValue();
 }
@@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 SDLoc DL(Op);
 LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
- if (Op.getValueType().isVector()) {
- assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
- "Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned NumElements = Op.getValueType().getVectorNumElements();
- assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+ if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate to 1 bit.
- switch (Load->getAddressSpace()) {
- default: break;
- case AMDGPUAS::CONSTANT_ADDRESS:
- if (isMemOpUniform(Load))
- break;
- // Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
- // loads.
- //
- // Fall-through
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::PRIVATE_ADDRESS:
- if (NumElements >= 8)
- return SplitVectorLoad(Op, DAG);
-
- // v4 loads are supported for private and global memory.
- if (NumElements <= 4)
- break;
- // fall-through
- case AMDGPUAS::LOCAL_ADDRESS:
- // If properly aligned, if we split we might be able to use ds_read_b64.
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, MVT::i8, MMO);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ if (!MemVT.isVector())
+ return SDValue();
+
+ assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+ "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+ unsigned AS = Load->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ AS, Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ unsigned NumElements = MemVT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ // + // Fall-through + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + // Depending on the setting of the private_element_size field in the + // resource descriptor, we can only make private accesses up to a certain + // size. + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorLoad(Load, DAG); + case 8: + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + return SDValue(); + case 16: + // Same as global/flat + if (NumElements > 4) return SplitVectorLoad(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); } } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} + if (NumElements == 2) + return SDValue(); -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); + } + default: + return SDValue(); + } } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } @@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } } - if (Unsafe) { + const SDNodeFlags *Flags = Op->getFlags(); + + if (Unsafe || Flags->hasAllowReciprocal()) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDNodeFlags Flags; @@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) + if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) return FastLowered; - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. - if (Subtarget->hasFP32Denormals()) - return SDValue(); - SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag + if (EnableAMDGPUFastFDIV) { + // This does not support denormals. + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? 
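+ // (The magic numbers above are 0x6f800000 = 2^96 and 0x2f800000 = 2^-32:
+ // when |RHS| exceeds 2^96 the denominator is pre-scaled by 2^-32 so its
+ // reciprocal stays well away from the denormal range, where v_rcp_f32
+ // flushes to zero, and multiplying by the same r3 factor afterwards
+ // restores the quotient.)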
- const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + } + + // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // Denominator is scaled to not be denormal, so using rcp is ok. + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); - // TODO: Should this propagate fast-math-flags? + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast(Op); EVT VT = Store->getMemoryVT(); - // These stores are legal. 
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); + if (VT == MVT::i1) { + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; + assert(VT.isVector() && + Store->getValue().getValueType().getScalarType() == MVT::i32); - if (VT.isVector() && VT.getVectorNumElements() >= 8) + unsigned AS = Store->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AS, Store->getAlignment())) { + return expandUnalignedStore(Store, DAG); + } + + unsigned NumElements = VT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorStore(Store, DAG); + case 8: + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + return SDValue(); + case 16: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) return SplitVectorStore(Op, DAG); - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + if (NumElements == 2) + return Op; - return SDValue(); + // If properly aligned, if we split we might be able to use ds_write_b64. + return SplitVectorStore(Op, DAG); + } + default: + llvm_unreachable("unhandled address space"); + } } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { } } +SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { + AtomicSDNode *AtomicNode = cast(Op); + assert(AtomicNode->isCompareAndSwap()); + unsigned AS = AtomicNode->getAddressSpace(); + + // No custom lowering required for local address space + if (!isFlatGlobalAddrSpace(AS)) + return Op; + + // Non-local address space requires custom lowering for atomic compare + // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 + SDLoc DL(Op); + SDValue ChainIn = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + SDValue Old = Op.getOperand(2); + SDValue New = Op.getOperand(3); + EVT VT = Op.getValueType(); + MVT SimpleVT = VT.getSimpleVT(); + MVT VecType = MVT::getVectorVT(SimpleVT, 2); + + SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); + SDValue Ops[] = { ChainIn, Addr, NewOld }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), + Ops, VT, AtomicNode->getMemOperand()); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, } } - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. 
- // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - return SDValue(); } /// \brief Return true if the given offset Size in bytes can be folded into /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { + const SISubtarget &STI) { switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { // MUBUF instructions a 12-bit offset in bytes. @@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, case AMDGPUAS::CONSTANT_ADDRESS: { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return isUInt<20>(OffsetSize); else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); @@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, if (DCI.isBeforeLegalize()) return SDValue(); + if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) + return Base; + SelectionDAG &DAG = DCI.DAG; // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + // TODO: This could be a generic combine with a predicate for extracting the + // high half of an integer being free. + + // (or i64:x, (zero_extend i32:y)) -> + // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) + if (LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LHS, RHS); + + if (RHS.getOpcode() == ISD::ZERO_EXTEND) { + SDValue ExtSrc = RHS.getOperand(0); + EVT SrcVT = ExtSrc.getValueType(); + if (SrcVT == MVT::i32) { + SDLoc SL(N); + SDValue LowLHS, HiBits; + std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); + SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); + + DCI.AddToWorklist(LowOr.getNode()); + DCI.AddToWorklist(HiBits.getNode()); + + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + LowOr, HiBits); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + } + } + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && RHS.getOpcode() == AMDGPUISD::FP_CLASS) { @@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } +// Constant fold canonicalize. +SDValue SITargetLowering::performFCanonicalizeCombine( + SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CFP = dyn_cast(N->getOperand(0)); + if (!CFP) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + const APFloat &C = CFP->getValueAPF(); + + // Flush denormals to 0 if not enabled. + if (C.isDenormal()) { + EVT VT = N->getValueType(0); + if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + + if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + } + + if (C.isNaN()) { + EVT VT = N->getValueType(0); + APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); + if (C.isSignaling()) { + // Quiet a signaling NaN. + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + // Make sure it is the canonical NaN bitpattern. + // + // TODO: Can we use -1 as the canonical NaN value since it's an inline + // immediate? 
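+ // (For f32, APFloat::getQNaN gives 0x7FC00000: all exponent bits set plus
+ // the most significant significand bit.)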
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ return SDValue(CFP, 0);
+}
+
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
 switch (Opc) {
 case ISD::FMAXNUM:
@@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
 }
 }
-SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) {
+ ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ if (Signed) {
+ if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ return SDValue();
+ } else {
+ if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ return SDValue();
+ }
+
+ EVT VT = K0->getValueType(0);
+ return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) {
+ ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ // Ordered >= (although NaN inputs should have folded away by now).
+ APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+ if (Cmp == APFloat::cmpGreaterThan)
+ return SDValue();
+
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+ // give the other result, which is different from med3 with a NaN input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
+
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
 SelectionDAG &DAG = DCI.DAG;
 unsigned Opc = N->getOpcode();
@@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
 // Only do this if the inner op has one use since this will just increase
 // register pressure for no benefit.
- // max(max(a, b), c)
- if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0.getOperand(0),
- Op0.getOperand(1),
- Op1);
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+ // max(max(a, b), c) -> max3(a, b, c)
+ // min(min(a, b), c) -> min3(a, b, c)
+ if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0.getOperand(0),
+ Op0.getOperand(1),
+ Op1);
+ }
+
+ // Try commuted.
+ // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } } - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == AMDGPUISD::FMIN_LEGACY && + Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; } return SDValue(); @@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: { + case ISD::UMIN: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } @@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: { unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - SDValue Src = N->getOperand(0); + + // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (Src.getOpcode() == ISD::SRL) { + // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x + // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + + if (const ConstantSDNode *C = dyn_cast(Src.getOperand(1))) { + unsigned SrcOffset = C->getZExtValue() + 8 * Offset; + if (SrcOffset < 32 && SrcOffset % 8 == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, + MVT::f32, Src.getOperand(0)); + } + } + } + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); APInt KnownZero, KnownOne; @@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + case ISD::ATOMIC_LOAD_UMAX: + case AMDGPUISD::ATOMIC_INC: + case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 
     if (DCI.isBeforeLegalize())
       break;
@@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performOrCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
+  case ISD::FCANONICALIZE:
+    return performFCanonicalizeCombine(N, DCI);
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RSQ_CLAMP:
+  case AMDGPUISD::LDEXP: {
+    SDValue Src = N->getOperand(0);
+    if (Src.isUndef())
+      return Src;
+    break;
+  }
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
 /// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
 /// and the immediate value if it's a literal immediate
 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
-
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 
   if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
     if (TII->isInlineConstant(Node->getAPIntValue()))
@@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                        SelectionDAG &DAG) const {
   SDNode *Users[4] = { };
   unsigned Lane = 0;
-  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
   unsigned NewDmask = 0;
 
   // Try to figure out the used register components
@@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
   // Adjust the writemask in the node
   std::vector<SDValue> Ops;
+  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
-  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
+  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
 
   // If we only got one lane, replace it with a copy
@@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
 
 /// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Node->getMachineOpcode())) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + !TII->isGather4(Opcode)) adjustWritemask(Node, DAG); - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || + Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } @@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// \brief Assign the register class depending on the number of /// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (TII->isVOP3(MI->getOpcode())) { + if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); return; } - if (TII->isMIMG(*MI)) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); + if (TII->isMIMG(MI)) { + unsigned VReg = MI.getOperand(0).getReg(); + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; + unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 1 : 0; @@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); + unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); + MI.setDesc(TII->get(NewOpcode)); MRI.setRegClass(VReg, RC); return; } // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); + MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + return; } + // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg + // instruction, because the return type of these instructions is a vec2 of + // the memory type, so it can be tied to the input operand. + // This means these instructions always have a use, so we need to add a + // special case to check if the atomic has only one extract_subreg use, + // which itself has no uses. + if ((Node->hasNUsesOfValue(1, 0) && + Node->use_begin()->isMachineOpcode() && + Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && + !Node->use_begin()->hasAnyUseOfValue(0))) { + unsigned Def = MI.getOperand(0).getReg(); + + // Change this into a noret atomic. + MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + + // If we only remove the def operand from the atomic instruction, the + // extract_subreg will be left with a use of a vreg without a def. 
+ // So we need to insert an implicit_def to avoid machine verifier + // errors. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Def); + } return; } } -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { +static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, + uint64_t Val) { SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); } MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, + const SDLoc &DL, SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); // Build the half of the subregister with the constants before building the // full 128-bit register. If we are building multiple resource descriptors, @@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, + SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const { SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index f01b2c0d09f3..8e055eea58c2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -12,26 +12,26 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H -#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H #include "AMDGPUISelLowering.h" #include "SIInstrInfo.h" namespace llvm { -class SITargetLowering : public AMDGPUTargetLowering { - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, +class SITargetLowering final : public AMDGPUTargetLowering { + SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, + unsigned Offset) const; + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; - SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -43,8 +43,13 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG 
&DAG) const; + SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; SDValue performUCharToFloatCombine(SDNode *N, @@ -55,14 +60,25 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; + + bool isCFIntrinsic(const SDNode *Intr) const; + + void createDebuggerPrologueStackObjects(MachineFunction &MF) const; public: - SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); + + const SISubtarget *getSubtarget() const; + + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + unsigned IntrinsicID) const override; bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, EVT /*VT*/) const override; @@ -89,21 +105,30 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; - SDValue LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; + const SmallVectorImpl &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const override; + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + MachineBasicBlock *splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const override; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -112,7 +137,7 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, + void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; int32_t analyzeImmediate(const SDNode *N) const; @@ -120,17 +145,16 @@ public: unsigned Reg, EVT VT) const override; void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; - MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; 
-  MachineSDNode *buildRSRC(SelectionDAG &DAG,
-                           SDLoc DL,
-                           SDValue Ptr,
-                           uint32_t RsrcDword1,
-                           uint64_t RsrcDword2And3) const;
+  MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
+                                SDValue Ptr) const;
+  MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr,
+                           uint32_t RsrcDword1, uint64_t RsrcDword2And3) const;
 
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
   ConstraintType getConstraintType(StringRef Constraint) const override;
-  SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
+  SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
+                   SDValue V) const;
 };
 
 } // End namespace llvm
 
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 94e614750d2f..d24588d6c143 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -26,6 +26,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
+#define DEBUG_TYPE "si-insert-waits"
+
 using namespace llvm;
 
 namespace {
@@ -53,7 +55,7 @@ typedef std::pair<unsigned, unsigned> RegInterval;
 
 class SIInsertWaits : public MachineFunctionPass {
 
 private:
-  static char ID;
+  const SISubtarget *ST;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
@@ -67,6 +69,10 @@ private:
   /// \brief Counter values we have already waited on.
   Counters WaitedOn;
 
+  /// \brief Counter values that we must wait on before the next counter
+  /// increase.
+  Counters DelayedWaitOn;
+
   /// \brief Counter values for last instruction issued.
   Counters LastIssued;
 
@@ -87,6 +93,9 @@ private:
   /// \brief Whether the machine function returns void
   bool ReturnsVoid;
 
+  /// Whether the VCCZ bit is possibly corrupt
+  bool VCCZCorrupt;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -99,13 +108,17 @@ private:
   /// \brief Handle instructions' async components
   void pushInstruction(MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator I);
+                       MachineBasicBlock::iterator I,
+                       const Counters& Increment);
 
   /// \brief Insert the actual wait instruction
   bool insertWait(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator I,
                   const Counters &Counts);
 
+  /// \brief Handle existing wait instructions (from intrinsics)
+  void handleExistingWait(MachineBasicBlock::iterator I);
+
   /// \brief Do we need def2def checks?
   bool unorderedDefines(MachineInstr &MI);
 
@@ -115,12 +128,20 @@ private:
   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 
+  /// Return true if there are LGKM instructions that haven't been waited on
+  /// yet.
+ bool hasOutstandingLGKM() const; + public: - SIInsertWaits(TargetMachine &tm) : + static char ID; + + SIInsertWaits() : MachineFunctionPass(ID), + ST(nullptr), TII(nullptr), TRI(nullptr), - ExpInstrTypesSeen(0) { } + ExpInstrTypesSeen(0), + VCCZCorrupt(false) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -136,13 +157,28 @@ public: } // End anonymous namespace +INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, + "SI Insert Waits", false, false) +INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE, + "SI Insert Waits", false, false) + char SIInsertWaits::ID = 0; -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +char &llvm::SIInsertWaitsID = SIInsertWaits::ID; + +FunctionPass *llvm::createSIInsertWaitsPass() { + return new SIInsertWaits(); +} + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } }; const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; -FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { - return new SIInsertWaits(tm); +static bool readsVCCZ(unsigned Opcode) { + return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ; +} + +bool SIInsertWaits::hasOutstandingLGKM() const { + return WaitedOn.Named.LGKM != LastIssued.Named.LGKM; } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { @@ -205,24 +241,23 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; // Check if this operand is the value being stored. - // Special case for DS instructions, since the address + // Special case for DS/FLAT instructions, since the address // operand comes before the value operand and it may have // multiple data operands. - if (TII->isDS(MI)) { + if (TII->isDS(MI) || TII->isFLAT(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; + } + if (TII->isDS(MI)) { MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); if (Data0 && Op.isIdenticalTo(*Data0)) return true; MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - if (Data1 && Op.isIdenticalTo(*Data1)) - return true; - - return false; + return Data1 && Op.isIdenticalTo(*Data1); } // NOTE: This assumes that the value operand is before the @@ -250,10 +285,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, } void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator I, + const Counters &Increment) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -270,8 +305,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, return; } - if (MBB.getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // @@ -281,8 +315,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || - (LastOpcodeType == VMEM && Increment.Named.VM)) { + if (LastOpcodeType == VMEM && Increment.Named.VM) { // Insert a NOP to break the clause. 
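The hunks below pack the three hardware counters into the S_WAITCNT immediate as VM in bits [3:0], EXP in [6:4], and LGKM in [11:8]; the WaitCounts change from {15, 7, 7} to {15, 7, 15} raises the LGKM ceiling to the full 4-bit field, and the insertWait mask change from 0x7 to 0xF matches it. A standalone C++ sketch of the packing, with hypothetical helper names:

    #include <cassert>
    #include <cstdint>

    // Mirrors the bitfields used by insertWait() and handleExistingWait():
    // VM [3:0], EXP [6:4], LGKM [11:8].
    static uint32_t encodeWaitcnt(uint32_t VM, uint32_t EXP, uint32_t LGKM) {
      return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0xF) << 8);
    }

    static void decodeWaitcnt(uint32_t Imm, uint32_t &VM, uint32_t &EXP,
                              uint32_t &LGKM) {
      VM = Imm & 0xF;
      EXP = (Imm >> 4) & 0x7;
      LGKM = (Imm >> 8) & 0xF;
    }

    int main() {
      uint32_t VM, EXP, LGKM;
      decodeWaitcnt(encodeWaitcnt(15, 7, 15), VM, EXP, LGKM);
      assert(VM == 15 && EXP == 7 && LGKM == 15); // new LGKM ceiling survives
      return 0;
    }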
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) .addImm(0); @@ -379,7 +412,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm((Counts.Named.VM & 0xF) | ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0x7) << 8)); + ((Counts.Named.LGKM & 0xF) << 8)); LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -393,16 +426,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) { Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } +/// \brief check whether any of the counters is non-zero +static bool countersNonZero(const Counters &Counter) { + for (unsigned i = 0; i < 3; ++i) + if (Counter.Array[i]) + return true; + return false; +} + +void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { + assert(I->getOpcode() == AMDGPU::S_WAITCNT); + + unsigned Imm = I->getOperand(0).getImm(); + Counters Counts, WaitOn; + + Counts.Named.VM = Imm & 0xF; + Counts.Named.EXP = (Imm >> 4) & 0x7; + Counts.Named.LGKM = (Imm >> 8) & 0xF; + + for (unsigned i = 0; i < 3; ++i) { + if (Counts.Array[i] <= LastIssued.Array[i]) + WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + else + WaitOn.Array[i] = 0; + } + + increaseCounters(DelayedWaitOn, WaitOn); +} + Counters SIInsertWaits::handleOperands(MachineInstr &MI) { Counters Result = ZeroCounts; - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - // For each register affected by this instruction increase the result // sequence. // @@ -432,8 +487,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - if (MBB.getParent()->getSubtarget().getGeneration() < - AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. @@ -460,13 +514,13 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - + ST = &MF.getSubtarget(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; + DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -475,6 +529,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + SmallVector RemoveMI; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -482,27 +538,81 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + // There is a hardware bug on CI/SI where SMRD instruction may corrupt + // vccz bit, so when we detect that an instruction may read from a + // corrupt vccz bit, we need to: + // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to + // complete. + // 2. Restore the correct value of vccz by writing the current value + // of vcc back to vcc. 
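The SI/CI workaround described in the two-step comment above, and implemented in the hunk that follows, amounts to a small state machine: an SMRD may leave vccz stale, a write to vcc restores it only once no LGKM operations are outstanding, and a corrupt vccz read is repaired by waiting on all counters and then copying vcc onto itself. A toy C++ model of that tracking (the event names are illustrative, not LLVM API):

    #include <cassert>

    // Toy model of the vccz tracking in the hunk below.
    struct VCCZTracker {
      bool Corrupt = false;
      int OutstandingLGKM = 0;

      void onSMRDIssued() { ++OutstandingLGKM; Corrupt = true; }
      void onLGKMWaitedOn() { OutstandingLGKM = 0; }
      void onVCCWrite() {
        // With LGKM ops still pending, the value landing in vcc may not be
        // final, so vccz stays suspect; only a settled write clears it.
        if (OutstandingLGKM == 0)
          Corrupt = false;
      }
      // True if "s_waitcnt" plus "s_mov_b64 vcc, vcc" must be inserted
      // before a branch that reads vccz.
      bool needsFixBeforeVCCZRead() const { return Corrupt; }
    };

    int main() {
      VCCZTracker T;
      T.onSMRDIssued();
      T.onVCCWrite();                       // LGKM pending: still corrupt
      assert(T.needsFixBeforeVCCZRead());
      T.onLGKMWaitedOn();
      T.onVCCWrite();                       // settled write restores vccz
      assert(!T.needsFixBeforeVCCZRead());
      return 0;
    }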
+
+        if (TII->isSMRD(I->getOpcode())) {
+          VCCZCorrupt = true;
+        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
+          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
+          // Whenever we store a value in vcc, the correct value of vccz is
+          // restored.
+          VCCZCorrupt = false;
+        }
+
+        // Check if we need to apply the bug work-around
+        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
+
+          // Wait on everything, not just LGKM. vccz reads usually come from
+          // terminators, and we always wait on everything at the end of the
+          // block, so if we only wait on LGKM here, we might end up with
+          // another s_waitcnt inserted right after this if there are non-LGKM
+          // instructions still outstanding.
+          insertWait(MBB, I, LastIssued);
+
+          // Restore the vccz bit. Any time a value is written to vcc, the vcc
+          // bit is updated, so we can restore the bit by reading the value of
+          // vcc and then writing it back to the register.
+          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+                  AMDGPU::VCC)
+            .addReg(AMDGPU::VCC);
+        }
+      }
+
+      // Record pre-existing, explicitly requested waits
+      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+        handleExistingWait(*I);
+        RemoveMI.push_back(&*I);
+        continue;
+      }
+
+      Counters Required;
+
       // Wait for everything before a barrier.
-      if (I->getOpcode() == AMDGPU::S_BARRIER)
-        Changes |= insertWait(MBB, I, LastIssued);
+      //
+      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+      // but we also want to wait for any other outstanding transfers before
+      // signalling other hardware blocks
+      if (I->getOpcode() == AMDGPU::S_BARRIER ||
+          I->getOpcode() == AMDGPU::S_SENDMSG)
+        Required = LastIssued;
       else
-        Changes |= insertWait(MBB, I, handleOperands(*I));
+        Required = handleOperands(*I);
+
+      Counters Increment = getHwCounts(*I);
 
-      pushInstruction(MBB, I);
+      if (countersNonZero(Required) || countersNonZero(Increment))
+        increaseCounters(Required, DelayedWaitOn);
+
+      Changes |= insertWait(MBB, I, Required);
+
+      pushInstruction(MBB, I, Increment);
       handleSendMsg(MBB, I);
     }
 
     // Wait for everything at the end of the MBB
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
-
-    // Functions returning something shouldn't contain S_ENDPGM, because other
-    // bytecode will be appended after it.
-    if (!ReturnsVoid) {
-      MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-      if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
-        I->eraseFromParent();
-    }
   }
 
+  for (MachineInstr *I : RemoveMI)
+    I->eraseFromParent();
+
   return Changes;
 }
 
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 0e883f64caa3..2f63d4ed13b3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
+  AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
 
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
@@ -31,6 +32,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> VOP2 = 0;
   field bits<1> VOP3 = 0;
   field bits<1> VOPC = 0;
+  field bits<1> SDWA = 0;
+  field bits<1> DPP = 0;
 
   field bits<1> MUBUF = 0;
   field bits<1> MTBUF = 0;
@@ -45,6 +48,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   // is unable to infer the encoding from the operands.
field bits<1> VOPAsmPrefer32Bit = 0; + field bits<1> Gather4 = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -63,18 +68,33 @@ class InstSI pattern> : let TSFlags{11} = VOP2; let TSFlags{12} = VOP3; let TSFlags{13} = VOPC; - - let TSFlags{14} = MUBUF; - let TSFlags{15} = MTBUF; - let TSFlags{16} = SMRD; - let TSFlags{17} = DS; - let TSFlags{18} = MIMG; - let TSFlags{19} = FLAT; - let TSFlags{20} = WQM; - let TSFlags{21} = VGPRSpill; - let TSFlags{22} = VOPAsmPrefer32Bit; + let TSFlags{14} = SDWA; + let TSFlags{15} = DPP; + + let TSFlags{16} = MUBUF; + let TSFlags{17} = MTBUF; + let TSFlags{18} = SMRD; + let TSFlags{19} = DS; + let TSFlags{20} = MIMG; + let TSFlags{21} = FLAT; + let TSFlags{22} = WQM; + let TSFlags{23} = VGPRSpill; + let TSFlags{24} = VOPAsmPrefer32Bit; + let TSFlags{25} = Gather4; let SchedRW = [Write32Bit]; + + field bits<1> DisableSIDecoder = 0; + field bits<1> DisableVIDecoder = 0; + field bits<1> DisableDecoder = 0; + + let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); +} + +class PseudoInstSI pattern = []> + : InstSI { + let isPseudo = 1; + let isCodeGenOnly = 1; } class Enc32 { @@ -123,8 +143,10 @@ class VOP2Common pattern> : let Size = 4; } -class VOP3Common pattern> : - VOPAnyCommon { +class VOP3Common pattern = [], bit HasMods = 0, + bit VOP3Only = 0> : + VOPAnyCommon { // Using complex patterns gives VOP3 patterns a very high complexity rating, // but standalone patterns are almost always prefered, so we need to adjust the @@ -135,7 +157,11 @@ class VOP3Common pattern> : let VOP3 = 1; let VALU = 1; - let AsmMatchConverter = "cvtVOP3"; + let AsmMatchConverter = + !if(!eq(VOP3Only,1), + "cvtVOP3", + !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); + let isCodeGenOnly = 0; int Size = 8; @@ -154,9 +180,9 @@ class VOP3Common pattern> : class SOP1e op> : Enc32 { bits<7> sdst; - bits<8> ssrc0; + bits<8> src0; - let Inst{7-0} = ssrc0; + let Inst{7-0} = src0; let Inst{15-8} = op; let Inst{22-16} = sdst; let Inst{31-23} = 0x17d; //encoding; @@ -164,22 +190,22 @@ class SOP1e op> : Enc32 { class SOP2e op> : Enc32 { bits<7> sdst; - bits<8> ssrc0; - bits<8> ssrc1; + bits<8> src0; + bits<8> src1; - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; + let Inst{7-0} = src0; + let Inst{15-8} = src1; let Inst{22-16} = sdst; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding } class SOPCe op> : Enc32 { - bits<8> ssrc0; - bits<8> ssrc1; + bits<8> src0; + bits<8> src1; - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; + let Inst{7-0} = src0; + let Inst{15-8} = src1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; } @@ -218,9 +244,7 @@ class SOPPe op> : Enc32 { class SMRDe op, bits<1> imm> : Enc32 { bits<7> sdst; bits<7> sbase; - bits<8> offset; - let Inst{7-0} = offset; let Inst{8} = imm; let Inst{14-9} = sbase{6-1}; let Inst{21-15} = sdst; @@ -228,6 +252,18 @@ class SMRDe op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe op> : SMRDe { + bits<8> offset; + let Inst{7-0} = offset; +} + +class SMRD_SOFFe op> : SMRDe { + bits<8> soff; + let Inst{7-0} = soff; +} + + + class SMRD_IMMe_ci op> : Enc64 { bits<7> sdst; bits<7> sbase; @@ -348,19 +384,18 @@ class VOP2_MADKe op> : Enc64 { bits<8> vdst; bits<9> src0; - bits<8> vsrc1; - bits<32> src2; + bits<8> src1; + bits<32> imm; let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; + let Inst{16-9} = src1; let Inst{24-17} = vdst; let Inst{30-25} = op; let Inst{31} = 0x0; // encoding - let Inst{63-32} = src2; + let Inst{63-32} = imm; } 
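Per the "kept in sync with the enum in SIInstrFlags" comment, the TSFlags relayout above shifts every format bit up by two to make room for SDWA (bit 14) and DPP (bit 15), with Gather4 appended at bit 25. A sketch of what the matching C++ side looks like; the bit positions come straight from the `let TSFlags{n}` assignments, but the enum and helper below are illustrative, not a copy of the in-tree header:

    #include <cassert>
    #include <cstdint>

    // Illustrative mirror of the relaid-out TSFlags bits.
    namespace SIInstrFlags {
    enum : uint32_t {
      SDWA              = 1u << 14,
      DPP               = 1u << 15,
      MUBUF             = 1u << 16,
      MTBUF             = 1u << 17,
      SMRD              = 1u << 18,
      DS                = 1u << 19,
      MIMG              = 1u << 20,
      FLAT              = 1u << 21,
      WQM               = 1u << 22,
      VGPRSpill         = 1u << 23,
      VOPAsmPrefer32Bit = 1u << 24,
      Gather4           = 1u << 25
    };
    } // namespace SIInstrFlags

    // Format queries reduce to a single bit test on the instruction's flags.
    static bool isMIMG(uint64_t Flags) { return Flags & SIInstrFlags::MIMG; }

    int main() {
      uint64_t Flags = SIInstrFlags::MIMG | SIInstrFlags::Gather4;
      assert(isMIMG(Flags));
      return 0;
    }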
-class VOP3e op> : Enc64 { - bits<8> vdst; +class VOP3a op> : Enc64 { bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -370,7 +405,6 @@ class VOP3e op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -386,6 +420,20 @@ class VOP3e op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP3e op> : VOP3a { + bits<8> vdst; + + let Inst{7-0} = vdst; +} + +// Encoding used for VOPC instructions encoded as VOP3 +// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst +class VOP3ce op> : VOP3a { + bits<8> sdst; + + let Inst{7-0} = sdst; +} + class VOP3be op> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -412,10 +460,10 @@ class VOP3be op> : Enc64 { class VOPCe op> : Enc32 { bits<9> src0; - bits<8> vsrc1; + bits<8> src1; let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; + let Inst{16-9} = src1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; } @@ -675,17 +723,17 @@ class FLAT op, dag outs, dag ins, string asm, list pattern> : let UseNamedOperandTable = 1; let hasSideEffects = 0; - let AsmMatchConverter = "cvtFlat"; let SchedRW = [WriteVMEM]; } -class MIMG op, dag outs, dag ins, string asm, list pattern> : - InstSI , MIMGe { +class MIMG pattern> : + InstSI { let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; let Uses = [EXEC]; + let UseNamedOperandTable = 1; let hasSideEffects = 0; // XXX ???? } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1e10d25e8fb7..d171e21c8a4f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -12,14 +12,15 @@ // //===----------------------------------------------------------------------===// - #include "SIInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" @@ -27,8 +28,8 @@ using namespace llvm; -SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} +SIInstrInfo::SIInstrInfo(const SISubtarget &ST) + : AMDGPUInstrInfo(ST), RI(), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -74,12 +75,12 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } -bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const { // TODO: The generic check fails for VALU instructions that should be // rematerializable due to implicit reads of exec. We really want all of the // generic logic for this except for this. 
- switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: @@ -201,18 +202,18 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, +bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const { - unsigned Opc = LdSt->getOpcode(); + unsigned Opc = LdSt.getOpcode(); - if (isDS(*LdSt)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + if (isDS(LdSt)) { + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); if (OffsetImm) { // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); @@ -222,10 +223,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, // The 2 offset instructions use offset0 and offset1 instead. We can treat // these as a load with a single offset if the 2 offsets are consecutive. We // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset1); + const MachineOperand *Offset0Imm = + getNamedOperand(LdSt, AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = + getNamedOperand(LdSt, AMDGPU::OpName::offset1); uint8_t Offset0 = Offset0Imm->getImm(); uint8_t Offset1 = Offset1Imm->getImm(); @@ -235,19 +236,19 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, // to bytes of the individual reads. 
unsigned EltSize; - if (LdSt->mayLoad()) - EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + if (LdSt.mayLoad()) + EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; else { - assert(LdSt->mayStore()); + assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); } if (isStride64(Opc)) EltSize *= 64; - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = EltSize * Offset0; return true; @@ -256,63 +257,91 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { + if (isMUBUF(LdSt) || isMTBUF(LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::vaddr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); return true; } - if (isSMRD(*LdSt)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + if (isSMRD(LdSt)) { + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); if (!OffsetImm) return false; - const MachineOperand *SBaseReg = getNamedOperand(*LdSt, - AMDGPU::OpName::sbase); + const MachineOperand *SBaseReg = + getNamedOperand(LdSt, AMDGPU::OpName::sbase); BaseReg = SBaseReg->getReg(); Offset = OffsetImm->getImm(); return true; } + if (isFLAT(LdSt)) { + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = 0; + return true; + } + return false; } -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // TODO: This needs finer tuning - if (NumLoads > 4) +bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { + const MachineOperand *FirstDst = nullptr; + const MachineOperand *SecondDst = nullptr; + + if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); + } + + if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } + + if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + } + + if (!FirstDst || !SecondDst) return false; - if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) - return true; + // Try to limit clustering based on the total number of bytes loaded + // rather than the number of instructions. This is done to help reduce + // register pressure. The method used is somewhat inexact, though, + // because it assumes that all loads in the cluster will load the + // same number of bytes as FirstLdSt. 
- if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) - return true; + // The unit of this value is bytes. + // FIXME: This needs finer tuning. + unsigned LoadClusterThreshold = 16; - if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && - (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) - return true; + const MachineRegisterInfo &MRI = + FirstLdSt.getParent()->getParent()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return false; + return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; } -void -SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // If we are trying to copy to or from SCC, there is a bug somewhere else in // the backend. While it may be theoretically possible to do this, it should @@ -361,7 +390,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; ArrayRef SubIndices; - bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -445,10 +473,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) - Forward = true; - else - Forward = false; + bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { unsigned SubIdx; @@ -463,10 +488,12 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); if (Idx == SubIndices.size() - 1) - Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); + Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + + Builder.addReg(SrcReg, RegState::Implicit); } } @@ -525,6 +552,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V32_SAVE; case 8: return AMDGPU::SI_SPILL_V64_SAVE; + case 12: + return AMDGPU::SI_SPILL_V96_SAVE; case 16: return AMDGPU::SI_SPILL_V128_SAVE; case 32: @@ -558,19 +587,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + // m0 may not be allowed for readlane. + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); + } + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. 
unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) // src + .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); @@ -585,10 +620,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) // src + .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } @@ -615,6 +651,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V32_RESTORE; case 8: return AMDGPU::SI_SPILL_V64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_V96_RESTORE; case 16: return AMDGPU::SI_SPILL_V128_RESTORE; case 32: @@ -648,6 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); + + if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + // m0 may not be allowed for readlane. + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); + } + BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); @@ -655,7 +700,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); @@ -671,20 +716,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled -unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, unsigned TmpReg, - unsigned FrameOffset, - unsigned Size) const { +unsigned SIInstrInfo::calculateLDSSpillAddress( + MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); - const AMDGPUSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = - static_cast(ST.getRegisterInfo()); + const SISubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -699,8 +742,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (TIDReg == AMDGPU::NoRegister) return TIDReg; - - if (MFI->getShaderType() == ShaderType::COMPUTE && + if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) 
&& WorkGroupSize > WavefrontSize) { unsigned TIDIGXReg @@ -716,7 +758,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, Entry.addLiveIn(Reg); } - RS->enterBasicBlock(&Entry); + RS->enterBasicBlock(Entry); // FIXME: Can we scavenge an SReg_64 and access the subregs? unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); @@ -773,8 +815,10 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, +void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, int Count) const { + DebugLoc DL = MBB.findDebugLoc(MI); while (Count > 0) { int Arg; if (Count >= 8) @@ -782,76 +826,87 @@ void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, else Arg = Count - 1; Count -= 8; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) .addImm(Arg); } } -bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineBasicBlock &MBB = *MI->getParent(); +void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + insertWaitStates(MBB, MI, 1); +} + +unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: return 1; // FIXME: Do wait states equal cycles? + + case AMDGPU::S_NOP: + return MI.getOperand(0).getImm() + 1; + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SGPR_USE: - // This is just a placeholder for register allocation. - MI->eraseFromParent(); - break; - case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? 
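The V_MOV_B64_PSEUDO expansion below materializes a 64-bit immediate as two v_mov_b32 writes to the sub0/sub1 halves, using Imm.getLoBits(32) and Imm.getHiBits(32). A plain-integer sketch of that split, independent of the APInt API:

    #include <cassert>
    #include <cstdint>

    // Low and high 32-bit halves of a 64-bit immediate.
    static uint32_t loBits32(uint64_t Imm) { return static_cast<uint32_t>(Imm); }
    static uint32_t hiBits32(uint64_t Imm) { return static_cast<uint32_t>(Imm >> 32); }

    int main() {
      uint64_t Imm = 0x123456789abcdef0ull;
      assert(loBits32(Imm) == 0x9abcdef0u); // -> v_mov_b32 dst.sub0
      assert(hiBits32(Imm) == 0x12345678u); // -> v_mov_b32 dst.sub1
      // The two halves reassemble to the original value.
      assert(((uint64_t)hiBits32(Imm) << 32 | loBits32(Imm)) == Imm);
      return 0;
    }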
assert(!SrcOp.isFPImm()); if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); } else { assert(SrcOp.isReg()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit | RegState::Define); } - MI->eraseFromParent(); + MI.eraseFromParent(); break; } case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI->getOperand(1).getReg(); - unsigned Src1 = MI->getOperand(2).getReg(); - const MachineOperand &SrcCond = MI->getOperand(3); + unsigned Src0 = MI.getOperand(1).getReg(); + unsigned Src1 = MI.getOperand(2).getReg(); + const MachineOperand &SrcCond = MI.getOperand(3); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addOperand(SrcCond); + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addReg(SrcCond.getReg()) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addOperand(SrcCond); - MI->eraseFromParent(); + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill())) + .addReg(Dst, RegState::Implicit | RegState::Define); + MI.eraseFromParent(); break; } - case AMDGPU::SI_CONSTDATA_PTR: { - const SIRegisterInfo *TRI = - static_cast(ST.getRegisterInfo()); + case AMDGPU::SI_PC_ADD_REL_OFFSET: { + const SIRegisterInfo *TRI + = static_cast(ST.getRegisterInfo()); MachineFunction &MF = *MBB.getParent(); - unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg = MI.getOperand(0).getReg(); unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); @@ -863,15 +918,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { // Add 32-bit offset from this instruction to the start of the // constant data. 
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addOperand(MI->getOperand(1))); + .addReg(RegLo) + .addOperand(MI.getOperand(1))); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi) .addImm(0)); llvm::finalizeBundle(MBB, Bundler.begin()); - MI->eraseFromParent(); + MI.eraseFromParent(); break; } } @@ -885,22 +940,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { /// non-commutable pair of operand indices OpIdx0 and OpIdx1. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. -MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, - bool NewMI, +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const { - int CommutedOpcode = commuteOpcode(*MI); + int CommutedOpcode = commuteOpcode(MI); if (CommutedOpcode == -1) return nullptr; - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); if ((OpIdx0 != static_cast(Src0Idx) || OpIdx1 != static_cast(Src1Idx)) && @@ -908,33 +962,32 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, OpIdx1 != static_cast(Src0Idx))) return nullptr; - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); - - if (isVOP2(*MI)) { - const MCInstrDesc &InstrDesc = MI->getDesc(); - // For VOP2 instructions, any operand type is valid to use for src0. Make - // sure we can use the src1 as src0. + if (isVOP2(MI) || isVOPC(MI)) { + const MCInstrDesc &InstrDesc = MI.getDesc(); + // For VOP2 and VOPC instructions, any operand type is valid to use for + // src0. Make sure we can use the src0 as src1. // // We could be stricter here and only allow commuting if there is a reason // to do so. i.e. if both operands are VGPRs there is no real benefit, // although MachineCSE attempts to find matches by commuting. - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) return nullptr; } + MachineInstr *CommutedMI = &MI; if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || - (!isVOP2(*MI) && !isVOP3(*MI))) { + if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) { return nullptr; } // Be sure to copy the source modifiers to the right place. 
- if (MachineOperand *Src0Mods - = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods - = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + if (MachineOperand *Src0Mods = + getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods = + getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); int Src0ModsVal = Src0Mods->getImm(); if (!Src1Mods && Src0ModsVal != 0) @@ -959,26 +1012,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); + CommutedMI = + TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } - if (MI) - MI->setDesc(get(CommutedOpcode)); + if (CommutedMI) + CommutedMI->setDesc(get(CommutedOpcode)); - return MI; + return CommutedMI; } // This needs to be implemented because the source modifiers may be inserted // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx0, +bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); if (!MCID.isCommutable()) return false; - unsigned Opc = MI->getOpcode(); + unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; @@ -986,24 +1039,24 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on // immediate. Also, immediate src0 operand is not handled in // SIInstrInfo::commuteInstruction(); - if (!MI->getOperand(Src0Idx).isReg()) + if (!MI.getOperand(Src0Idx).isReg()) return false; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src1.isImm()) { // SIInstrInfo::commuteInstruction() does support commuting the immediate // operand src1 in 2 and 3 operand instructions. - if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode())) return false; } else if (Src1.isReg()) { // If any source modifiers are set, the generic instruction commuting won't // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)) return false; } else return false; @@ -1011,23 +1064,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } -MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, - unsigned SrcReg) const { - return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), - DstReg) .addReg(SrcReg); +unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { + switch (Cond) { + case SIInstrInfo::SCC_TRUE: + return AMDGPU::S_CBRANCH_SCC1; + case SIInstrInfo::SCC_FALSE: + return AMDGPU::S_CBRANCH_SCC0; + case SIInstrInfo::VCCNZ: + return AMDGPU::S_CBRANCH_VCCNZ; + case SIInstrInfo::VCCZ: + return AMDGPU::S_CBRANCH_VCCZ; + case SIInstrInfo::EXECNZ: + return AMDGPU::S_CBRANCH_EXECNZ; + case SIInstrInfo::EXECZ: + return AMDGPU::S_CBRANCH_EXECZ; + default: + llvm_unreachable("invalid branch predicate"); + } +} + +SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_CBRANCH_SCC0: + return SCC_FALSE; + case AMDGPU::S_CBRANCH_SCC1: + return SCC_TRUE; + case AMDGPU::S_CBRANCH_VCCNZ: + return VCCNZ; + case AMDGPU::S_CBRANCH_VCCZ: + return VCCZ; + case AMDGPU::S_CBRANCH_EXECNZ: + return EXECNZ; + case AMDGPU::S_CBRANCH_EXECZ: + return EXECZ; + default: + return INVALID_BR; + } } -bool SIInstrInfo::isMov(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: +bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + + if (I == MBB.end()) + return false; + + if (I->getOpcode() == AMDGPU::S_BRANCH) { + // Unconditional Branch + TBB = I->getOperand(0).getMBB(); + return false; + } + + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) return true; + + MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + + ++I; + + if (I == MBB.end()) { + // Conditional branch followed by fall-through. 
+    TBB = CondBB;
+    return false;
+  }
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    TBB = CondBB;
+    FBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  unsigned Count = 0;
+  while (I != MBB.end()) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    I->eraseFromParent();
+    ++Count;
+    I = Next;
+  }
+
+  return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   const DebugLoc &DL) const {
+
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  if (!FBB) {
+    BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+    return 1;
   }
+
+  assert(TBB && FBB);
+
+  BuildMI(&MBB, DL, get(Opcode))
+    .addMBB(TBB);
+  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+    .addMBB(FBB);
+
+  return 2;
+}
+
+bool SIInstrInfo::ReverseBranchCondition(
+  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1);
+  Cond[0].setImm(-Cond[0].getImm());
+  return false;
 }
 
 static void removeModOperands(MachineInstr &MI) {
@@ -1044,81 +1209,76 @@ static void removeModOperands(MachineInstr &MI) {
   MI.RemoveOperand(Src0ModIdx);
 }
 
-bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+// TODO: Maybe this should be removed, and everything should instead be
+// custom folded in SIFoldOperands?
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                 unsigned Reg, MachineRegisterInfo *MRI) const {
   if (!MRI->hasOneNonDBGUse(Reg))
     return false;
 
-  unsigned Opc = UseMI->getOpcode();
+  unsigned Opc = UseMI.getOpcode();
   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
     // Don't fold if we are using source modifiers. The new VOP2 instructions
     // don't have them.
-    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
-        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
-        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
       return false;
     }
 
-    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
-    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
-    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+    const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+    // If this is a free constant, there's no reason to do this.
+    // TODO: We could fold this here instead of letting SIFoldOperands do it
+    // later.
+    if (isInlineConstant(ImmOp, 4))
+      return false;
+
+    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
     // Multiplied part is the constant: Use v_madmk_f32
     // We should only expect these to be on src0 due to canonicalizations.
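// For orientation (illustrative, the standard VOP2 semantics rather than
// anything this change defines), the two folded forms are:
//   v_madmk_f32 dst, src0, K, src1   ; dst = src0 * K + src1
//   v_madak_f32 dst, src0, src1, K   ; dst = src0 * src1 + K
// so folding the constant as the multiplicand needs the operand shuffle
// performed below, while the addend case later in this function can fold
// the immediate in place.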
if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; - if (!Src2->isReg() || - (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; - // We need to do some weird looking operand shuffling since the madmk - // operands are out of the normal expected order with the multiplied - // constant as the last operand. - // - // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 - // src0 -> src2 K - // src1 -> src0 - // src2 -> src1 + // We need to swap operands 0 and 1 since madmk constant is at operand 1. - const int64_t Imm = DefMI->getOperand(1).getImm(); + const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::clamp)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); unsigned Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); - unsigned Src2Reg = Src2->getReg(); - unsigned Src2SubReg = Src2->getSubReg(); Src0->setReg(Src1Reg); Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - Src1->setReg(Src2Reg); - Src1->setSubReg(Src2SubReg); - Src1->setIsKill(Src2->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { - UseMI->untieRegOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + UseMI.untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - Src2->ChangeToImmediate(Imm); + Src1->ChangeToImmediate(Imm); - removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + removeModOperands(UseMI); + UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) - DefMI->eraseFromParent(); + DefMI.eraseFromParent(); return true; } @@ -1131,36 +1291,35 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) return false; - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; - const int64_t Imm = DefMI->getOperand(1).getImm(); + const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::clamp)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64) { - UseMI->untieRegOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + UseMI.untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. 
- removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + removeModOperands(UseMI); + UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) - DefMI->eraseFromParent(); + DefMI.eraseFromParent(); return true; } @@ -1177,17 +1336,20 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, return LowOffset + LowWidth <= HighOffset; } -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const { - unsigned BaseReg0, Offset0; - unsigned BaseReg1, Offset1; +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, + MachineInstr &MIb) const { + unsigned BaseReg0, BaseReg1; + int64_t Offset0, Offset1; if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { - assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && - "read2 / write2 not expected here yet"); - unsigned Width0 = (*MIa->memoperands_begin())->getSize(); - unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + + if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { + // FIXME: Handle ds_read2 / ds_write2. + return false; + } + unsigned Width0 = (*MIa.memoperands_begin())->getSize(); + unsigned Width1 = (*MIb.memoperands_begin())->getSize(); if (BaseReg0 == BaseReg1 && offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { return true; @@ -1197,19 +1359,19 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, return false; } -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, + MachineInstr &MIb, AliasAnalysis *AA) const { - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); - assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + assert((MIb.mayLoad() || MIb.mayStore()) && "MIb must load from or modify a memory location"); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) return false; // XXX - Can we relax this between address spaces? - if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // TODO: Should we check the address space from the MachineMemOperand? That @@ -1217,29 +1379,29 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
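// Illustrative arithmetic for checkInstOffsetsDoNotOverlap() above: with a
// shared base register, two accesses are trivially disjoint when
// LowOffset + LowWidth <= HighOffset. E.g. a 4-byte access at offset 0
// (bytes [0, 4)) versus a 4-byte access at offset 8 (bytes [8, 12))
// passes, since 0 + 4 <= 8, while a 16-byte access at offset 0 against an
// access at offset 8 would not.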
- if (isDS(*MIa)) { - if (isDS(*MIb)) + if (isDS(MIa)) { + if (isDS(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb); + return !isFLAT(MIb); } - if (isMUBUF(*MIa) || isMTBUF(*MIa)) { - if (isMUBUF(*MIb) || isMTBUF(*MIb)) + if (isMUBUF(MIa) || isMTBUF(MIa)) { + if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb) && !isSMRD(*MIb); + return !isFLAT(MIb) && !isSMRD(MIb); } - if (isSMRD(*MIa)) { - if (isSMRD(*MIb)) + if (isSMRD(MIa)) { + if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); + return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); } - if (isFLAT(*MIa)) { - if (isFLAT(*MIb)) + if (isFLAT(MIa)) { + if (isFLAT(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1249,35 +1411,49 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, } MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, - MachineBasicBlock::iterator &MI, - LiveVariables *LV) const { - - switch (MI->getOpcode()) { - default: return nullptr; - case AMDGPU::V_MAC_F32_e64: break; - case AMDGPU::V_MAC_F32_e32: { - const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); - if (Src0->isImm() && !isInlineConstant(*Src0, 4)) - return nullptr; - break; - } + MachineInstr &MI, + LiveVariables *LV) const { + + switch (MI.getOpcode()) { + default: + return nullptr; + case AMDGPU::V_MAC_F32_e64: + break; + case AMDGPU::V_MAC_F32_e32: { + const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + return nullptr; + break; + } } - const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst); - const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); - const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); - const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) - .addOperand(*Dst) - .addImm(0) // Src0 mods - .addOperand(*Src0) - .addImm(0) // Src1 mods - .addOperand(*Src1) - .addImm(0) // Src mods - .addOperand(*Src2) - .addImm(0) // clamp - .addImm(0); // omod + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + .addOperand(*Dst) + .addImm(0) // Src0 mods + .addOperand(*Src0) + .addImm(0) // Src1 mods + .addOperand(*Src1) + .addImm(0) // Src mods + .addOperand(*Src2) + .addImm(0) // clamp + .addImm(0); // omod +} + +bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // XXX - Do we want the SP check in the base implementation? + + // Target-independent instructions do not have an implicit-use of EXEC, even + // when they operate on VGPRs. Treating EXEC modifications as scheduling + // boundaries prevents incorrect movements of such instructions. 
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || + MI.modifiesRegister(AMDGPU::EXEC, &RI); } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { @@ -1355,9 +1531,9 @@ static bool compareMachineOp(const MachineOperand &Op0, } } -bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; +bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); @@ -1418,14 +1594,10 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return true; // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - return true; - } - - return false; + return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); } static unsigned findImplicitSGPRRead(const MachineInstr &MI) { @@ -1448,10 +1620,33 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) { return AMDGPU::NoRegister; } -bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, +static bool shouldReadExec(const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) { + switch (MI.getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READLANE_B32_si: + case AMDGPU::V_READLANE_B32_vi: + case AMDGPU::V_WRITELANE_B32: + case AMDGPU::V_WRITELANE_B32_si: + case AMDGPU::V_WRITELANE_B32_vi: + return false; + } + + return true; + } + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || + SIInstrInfo::isSALU(MI) || + SIInstrInfo::isSMRD(MI)) + return false; + + return true; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { - uint16_t Opcode = MI->getOpcode(); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + uint16_t Opcode = MI.getOpcode(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); @@ -1459,14 +1654,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); if (!Desc.isVariadic() && - Desc.getNumOperands() != MI->getNumExplicitOperands()) { - ErrInfo = "Instruction has wrong number of operands."; - return false; + Desc.getNumOperands() != MI.getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; } // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isFPImm()) { + if (MI.getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " "all fp values to integers."; return false; @@ -1476,7 +1671,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm()) { + if (MI.getOperand(i).isImm()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -1484,17 +1679,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM32: break; case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i), + if (isLiteralConstant(MI.getOperand(i), RI.getRegClass(RegClass)->getSize())) { ErrInfo = "Illegal immediate value for operand."; return false; } break; case MCOI::OPERAND_IMMEDIATE: + case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. // FrameIndex operands will be replaced by immediates, so they are // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } @@ -1503,12 +1699,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, continue; } - if (!MI->getOperand(i).isReg()) + if (!MI.getOperand(i).isReg()) continue; if (RegClass != -1) { - unsigned Reg = MI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + unsigned Reg = MI.getOperand(i).getReg(); + if (Reg == AMDGPU::NoRegister || + TargetRegisterInfo::isVirtualRegister(Reg)) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -1519,23 +1716,26 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } - // Verify VOP* - if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = findImplicitSGPRRead(*MI); + + if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) + ++ConstantBusCount; + + unsigned SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) ++ConstantBusCount; for (int OpIdx : OpIndices) { if (OpIdx == -1) break; - const MachineOperand &MO = MI->getOperand(OpIdx); + const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) @@ -1555,9 +1755,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify misc. restrictions on specific instructions. if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - const MachineOperand &Src0 = MI->getOperand(Src0Idx); - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - const MachineOperand &Src2 = MI->getOperand(Src2Idx); + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &Src1 = MI.getOperand(Src1Idx); + const MachineOperand &Src2 = MI.getOperand(Src2Idx); if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { if (!compareMachineOp(Src0, Src1) && !compareMachineOp(Src0, Src2)) { @@ -1569,9 +1769,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. 
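// For illustration (hypothetical MIR, not from this patch): a VALU op is
// expected to look like
//   %vgpr0 = V_ADD_F32_e32 %vgpr1, %vgpr2, implicit %exec
// and shouldReadExec() above exempts only v_readlane/v_writelane, SALU,
// SMRD, and generic opcodes from carrying that implicit exec use.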
- if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { - const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); - if (!Exec || !Exec->isImplicit()) { + if (shouldReadExec(MI)) { + if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { ErrInfo = "VALU instruction does not implicitly read exec mask"; return false; } @@ -1624,22 +1823,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORD_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; + case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; + case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; + case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; + case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; + case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; + case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } } @@ -1676,12 +1871,12 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { } } -void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { +void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &MO = MI->getOperand(OpIdx); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (MO.isReg()) @@ -1689,7 +1884,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { else if (RI.isSGPRClass(RC)) Opcode = AMDGPU::S_MOV_B32; - const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) VRC = &AMDGPU::VReg_64RegClass; @@ -1698,8 +1892,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) - .addOperand(MO); + BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); MO.ChangeToRegister(Reg, false); } @@ -1758,11 +1951,11 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( } // Change the order of operands from (0, 1, 2) to (0, 2, 1) -void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { - assert(Inst->getNumExplicitOperands() == 3); - MachineOperand Op1 = 
Inst->getOperand(1); - Inst->RemoveOperand(1); - Inst->addOperand(Op1); +void SIInstrInfo::swapOperands(MachineInstr &Inst) const { + assert(Inst.getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst.getOperand(1); + Inst.RemoveOperand(1); + Inst.addOperand(Op1); } bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, @@ -1804,26 +1997,32 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return true; } -bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, +bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) - MO = &MI->getOperand(OpIdx); + MO = &MI.getOperand(OpIdx); + + if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + + RegSubRegPair SGPRUsed; + if (MO->isReg()) + SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); - if (isVALU(*MI) && - usesConstantBus(MRI, *MO, DefinedRC->getSize())) { - unsigned SGPRUsed = - MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; - const MachineOperand &Op = MI->getOperand(i); - if (Op.isReg() && Op.getReg() != SGPRUsed && - usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + const MachineOperand &Op = MI.getOperand(i); + if (Op.isReg()) { + if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && + usesConstantBus(MRI, Op, getOpSize(MI, i))) { + return false; + } + } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { return false; } } @@ -1834,7 +2033,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isLegalRegOperand(MRI, OpInfo, *MO); } - // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); @@ -1847,12 +2045,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, } void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, - MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); + MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 // we need to only have one constant bus use. @@ -1860,10 +2058,10 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // Note we do not need to worry about literal constants here. They are // disabled for the operand type for instructions because they will always // violate the one constant bus use rule. 
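// Constant bus refresher (illustrative): a VALU instruction may read at
// most one SGPR or literal through the scalar constant bus, so e.g.
//   v_add_f32 v0, s0, v1   ; legal, one constant bus use
//   v_add_f32 v0, s0, s1   ; illegal, two constant bus uses
// and the implicit VCC read of v_addc_u32/v_subb_u32 already occupies that
// one slot, which is why the HasImplicitSGPR path below must move src0
// into a VGPR.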
- bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; if (HasImplicitSGPR) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); @@ -1878,13 +2076,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste // compile time pointlessly swapping and checking legality again. - if (HasImplicitSGPR || !MI->isCommutable()) { + if (HasImplicitSGPR || !MI.isCommutable()) { legalizeOpWithMove(MI, Src1Idx); return; } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + MachineOperand &Src0 = MI.getOperand(Src0Idx); // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. @@ -1897,13 +2095,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } - int CommutedOpc = commuteOpcode(*MI); + int CommutedOpc = commuteOpcode(MI); if (CommutedOpc == -1) { legalizeOpWithMove(MI, Src1Idx); return; } - MI->setDesc(get(CommutedOpc)); + MI.setDesc(get(CommutedOpc)); unsigned Src0Reg = Src0.getReg(); unsigned Src0SubReg = Src0.getSubReg(); @@ -1925,10 +2123,9 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // operand, and since literal constants are not allowed and should never be // seen, we only need to worry about inserting copies if we use multiple SGPR // operands. -void SIInstrInfo::legalizeOperandsVOP3( - MachineRegisterInfo &MRI, - MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, + MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); int VOP3Idx[3] = { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -1943,7 +2140,7 @@ void SIInstrInfo::legalizeOperandsVOP3( int Idx = VOP3Idx[i]; if (Idx == -1) break; - MachineOperand &MO = MI->getOperand(Idx); + MachineOperand &MO = MI.getOperand(Idx); // We should never see a VOP3 instruction with an illegal immediate operand. 
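// Sketch of this loop's effect (illustrative): VOP3 also gets just one
// constant-bus read, so for something like
//   v_fma_f32 v0, s0, s1, v1
// the first SGPR is kept and legalizeOpWithMove() rewrites the rest:
//   v_mov_b32 v2, s1
//   v_fma_f32 v0, s0, v2, v1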
     if (!MO.isReg())
@@ -1964,32 +2161,78 @@ void SIInstrInfo::legalizeOperandsVOP3(
   }
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
-  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+                                         MachineRegisterInfo &MRI) const {
+  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+  unsigned DstReg = MRI.createVirtualRegister(SRC);
+  unsigned SubRegs = VRC->getSize() / 4;
+
+  SmallVector<unsigned, 8> SRegs;
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+    SRegs.push_back(SGPR);
+  }
+
+  MachineInstrBuilder MIB =
+      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+              get(AMDGPU::REG_SEQUENCE), DstReg);
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    MIB.addReg(SRegs[i]);
+    MIB.addImm(RI.getSubRegFromChannel(i));
+  }
+  return DstReg;
+}
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instructions, so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
 
   // Legalize VOP2
-  if (isVOP2(*MI)) {
+  if (isVOP2(MI) || isVOPC(MI)) {
     legalizeOperandsVOP2(MRI, MI);
     return;
   }
 
   // Legalize VOP3
-  if (isVOP3(*MI)) {
+  if (isVOP3(MI)) {
     legalizeOperandsVOP3(MRI, MI);
     return;
   }
 
+  // Legalize SMRD
+  if (isSMRD(MI)) {
+    legalizeOperandsSMRD(MRI, MI);
+    return;
+  }
+
   // Legalize REG_SEQUENCE and PHI
   // The register class of the operands must be the same type as the register
   // class of the output.
-  if (MI->getOpcode() == AMDGPU::PHI) {
+  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
-    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
-      if (!MI->getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      if (!MI.getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
         continue;
       const TargetRegisterClass *OpRC =
-          MRI.getRegClass(MI->getOperand(i).getReg());
+          MRI.getRegClass(MI.getOperand(i).getReg());
       if (RI.hasVGPRs(OpRC)) {
         VRC = OpRC;
       } else {
@@ -2000,7 +2243,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     // If any of the operands are VGPR registers, then they all must be
    // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
    // legalizing them.
-    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
       if (!VRC) {
         assert(SRC);
         VRC = RI.getEquivalentVGPRClass(SRC);
@@ -2011,18 +2254,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
       }
 
       // Update all the operands so they have the same type.
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. - MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); } } @@ -2030,15 +2273,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // REG_SEQUENCE doesn't really require operand legalization, but if one has a // VGPR dest type and SGPR sources, insert copies so all operands are // VGPRs. This seems to help operand folding / the register coalescer. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - MachineBasicBlock *MBB = MI->getParent(); - const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); if (RI.hasVGPRs(DstRC)) { // Update all the operands so they are VGPR register classes. These may // not be the same register class because REG_SEQUENCE supports mixing // subregister index types e.g. sub0_sub1 + sub2 + sub3 - for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; @@ -2049,8 +2292,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned DstReg = MRI.createVirtualRegister(VRC); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); Op.setIsKill(); @@ -2062,17 +2305,33 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize INSERT_SUBREG // src0 must have the same register class as dst - if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src0 = MI->getOperand(1).getReg(); + if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI->getOperand(1).setReg(NewSrc0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI.getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MIMG + if (isMIMG(MI)) { + MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); + if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); + SRsrc->setReg(SGPR); + } + + MachineOperand *SSamp = getNamedOperand(MI, 
AMDGPU::OpName::ssamp); + if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); + SSamp->setReg(SGPR); } return; } @@ -2081,11 +2340,11 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. int SRsrcIdx = - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (SRsrcIdx != -1) { // We have an MUBUF instruction - MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); - unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), RI.getRegClass(SRsrcRC))) { // The operands are legal. @@ -2093,7 +2352,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { return; } - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); // Extract the ptr from the resource descriptor. unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, @@ -2107,30 +2366,27 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) + .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer @@ -2139,7 +2395,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtr, 0, AMDGPU::sub0) .addReg(VAddr->getReg(), 0, AMDGPU::sub0); @@ -2150,82 +2406,82 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addReg(VAddr->getReg(), 0, AMDGPU::sub1); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(NewVAddrLo) - 
.addImm(AMDGPU::sub0)
-        .addReg(NewVAddrHi)
-        .addImm(AMDGPU::sub1);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+          .addReg(NewVAddrLo)
+          .addImm(AMDGPU::sub0)
+          .addReg(NewVAddrHi)
+          .addImm(AMDGPU::sub1);
     } else {
       // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
-      assert(MBB.getParent()->getSubtarget().getGeneration()
-             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
+             < SISubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");
 
-      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
-      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
-      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
+      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
 
       // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
-      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
       MachineInstr *Addr64;
 
       if (!VDataIn) {
         // Regular buffer load / store.
-        MachineInstrBuilder MIB
-          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
-          .addOperand(*VData)
-          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-          // This will be replaced later
-          // with the new value of vaddr.
-          .addOperand(*SRsrc)
-          .addOperand(*SOffset)
-          .addOperand(*Offset);
+        MachineInstrBuilder MIB =
+            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+                .addOperand(*VData)
+                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                            // This will be replaced later
+                                            // with the new value of vaddr.
+                .addOperand(*SRsrc)
+                .addOperand(*SOffset)
+                .addOperand(*Offset);
 
         // Atomics do not have this operand.
-        if (const MachineOperand *GLC
-            = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+        if (const MachineOperand *GLC =
+                getNamedOperand(MI, AMDGPU::OpName::glc)) {
           MIB.addImm(GLC->getImm());
         }
 
-        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
 
-        if (const MachineOperand *TFE
-            = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+        if (const MachineOperand *TFE =
+                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
           MIB.addImm(TFE->getImm());
         }
 
-        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
         Addr64 = MIB;
       } else {
         // Atomics with return.
-        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
-          .addOperand(*VData)
-          .addOperand(*VDataIn)
-          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-          // This will be replaced later
-          // with the new value of vaddr.
+        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+                     .addOperand(*VData)
+                     .addOperand(*VDataIn)
+                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                                 // This will be replaced later
+                                                 // with the new value of vaddr.
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) + .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } - MI->removeFromParent(); - MI = Addr64; + MI.removeFromParent(); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(SRsrcPtr, 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(SRsrcPtr, 0, AMDGPU::sub1) - .addImm(AMDGPU::sub1); - - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + + VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); } // Update the instruction to use NewVaddr @@ -2235,300 +2491,85 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addReg(SOff->getReg(), 0, SOff->getSubReg()) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - const TargetRegisterClass *NewDstRC; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - NewDstRC = &AMDGPU::VReg_64RegClass; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - NewDstRC = &AMDGPU::VReg_128RegClass; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - NewDstRC = &AMDGPU::VReg_256RegClass; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - NewDstRC = &AMDGPU::VReg_512RegClass; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - unsigned OldDst = MI->getOperand(0).getReg(); - unsigned NewDst = MRI.createVirtualRegister(NewDstRC); - - MRI.replaceRegWith(OldDst, NewDst); - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, - MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const { - MachineBasicBlock *MBB = MI->getParent(); - int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; - switch(RI.getRegClass(DstRCID)->getSize()) { - case 4: - case 8: - case 16: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets on SI and byte offset on VI - // and MUBUF instructions always take a byte offset. 
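// Worked example for the offset conversion in this block, illustrative
// only: an SMRD immediate offset of 4 on SI/CI addresses dword 4, i.e.
// byte 16, so the code shifts it left by 2 (ImmOffset <<= 2) before
// building the MUBUF equivalent; on VI the SMRD offset is already in
// bytes and no scaling is needed.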
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - - const MCInstrDesc &NewInstDesc = get(NewOpcode); - const TargetRegisterClass *NewDstRC - = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - unsigned DstReg = MI->getOperand(0).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - - MachineInstr *NewInst = - BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) - .addOperand(MI->getOperand(1)) // sbase - .addReg(SRsrc) - .addImm(0) - .addImm(ImmOffset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); - MI->eraseFromParent(); - - legalizeOperands(NewInst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); - break; - } - case 32: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - - case 64: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - } -} - void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector Worklist; Worklist.push_back(&TopInst); while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); + MachineInstr &Inst = *Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst.getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = getVALUOp(Inst); // Handle some special cases switch (Opcode) { default: - if (isSMRD(*Inst)) { - moveSMRDToVALU(Inst, MRI, Worklist); - continue; - } break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_OR_B64: splitScalar64BitBinaryOp(Worklist, 
Inst, AMDGPU::V_OR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BCNT1_I32_B64: splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BFE_I64: { splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; } case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -2536,9 +2577,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_ABS_I32: lowerScalarAbs(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: + // Clear unused bits of vcc + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), + AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + break; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2553,34 +2603,36 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Use the new VALU Opcode. const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); + Inst.setDesc(NewDesc); // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); + for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst.getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + Inst.RemoveOperand(i); + addSCCDefUsersToVALUWorklist(Inst, Worklist); + } } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); + Inst.addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(Size)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(0)); } - Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); + Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + const MachineOperand &OffsetWidthOp = Inst.getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand // back into the 2 separate ones for bit offset and width. assert(OffsetWidthOp.isImm() && @@ -2589,50 +2641,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst.RemoveOperand(2); // Remove old immediate. + Inst.addOperand(MachineOperand::CreateImm(Offset)); + Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } - // Update the destination register class. - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); - if (!NewDstRC) - continue; + bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); + unsigned NewDstReg = AMDGPU::NoRegister; + if (HasDst) { + // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + if (!NewDstRC) + continue; - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); + unsigned DstReg = Inst.getOperand(0).getReg(); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } // Legalize the operands legalizeOperands(Inst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (HasDst) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2649,15 +2692,14 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2703,16 +2745,15 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2738,9 +2779,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, 
DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); + MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); @@ -2748,9 +2789,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub1, Src1SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); + MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -2770,16 +2811,16 @@ void SIInstrInfo::splitScalar64BitBinaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); +void SIInstrInfo::splitScalar64BitBCNT( + SmallVectorImpl &Worklist, MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); const TargetRegisterClass *SrcRC = Src.isReg() ? @@ -2812,24 +2853,22 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - uint32_t Imm = Inst->getOperand(2).getImm(); + MachineOperand &Dest = Inst.getOperand(0); + uint32_t Imm = Inst.getOperand(2).getImm(); uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. (void) Offset; // Only sext_inreg cases handled. 
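
Two of the scalar-to-vector rewrites above are easy to check at the value level: S_BFE's packed offset/width immediate (offset in bits [5:0], width in bits [22:16], as extracted just above) and the 64-bit bit-count, which V_BCNT_U32_B32 accumulates one 32-bit half at a time. A standalone C++ sketch, illustrative only and not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Pack and unpack the S_BFE offset/width source immediate.
    static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
      return (Offset & 0x3f) | ((Width & 0x7f) << 16);
    }
    static void unpackBFE(uint32_t Imm, uint32_t &Offset, uint32_t &Width) {
      Offset = Imm & 0x3f;              // Extract bits [5:0].
      Width = (Imm & 0x7f0000) >> 16;   // Extract bits [22:16].
    }

    // 64-bit popcount as two V_BCNT_U32_B32 steps: bcnt(hi, bcnt(lo, 0)).
    // __builtin_popcount is the GCC/Clang builtin.
    static uint32_t bcnt64(uint64_t V) {
      auto Bcnt = [](uint32_t Src, uint32_t Accum) -> uint32_t {
        return Accum + __builtin_popcount(Src);
      };
      return Bcnt(uint32_t(V >> 32), Bcnt(uint32_t(V), 0));
    }

    int main() {
      uint32_t Offset, Width;
      unpackBFE(packBFE(8, 16), Offset, Width);
      assert(Offset == 8 && Width == 16);
      assert(bcnt64(0xf0f0f0f0f0f0f0f0ULL) == 32);
      return 0;
    }
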
- assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && - BitWidth <= 32 && - Offset == 0 && - "Not implemented"); + assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && + Offset == 0 && "Not implemented"); if (BitWidth < 32) { unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2837,9 +2876,9 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) - .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) - .addImm(0) - .addImm(BitWidth); + .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) .addImm(31) @@ -2856,7 +2895,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, return; } - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); @@ -2887,6 +2926,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } +void SIInstrInfo::addSCCDefUsersToVALUWorklist( + MachineInstr &SCCDefInst, SmallVectorImpl &Worklist) const { + // This assumes that all the users of SCC are in the same block + // as the SCC def. + for (MachineInstr &MI : + llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), + SCCDefInst.getParent()->end())) { + // Exit if we find another SCC def. + if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + return; + + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + Worklist.push_back(&MI); + } +} + const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); @@ -2912,9 +2967,9 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( } // Find the one SGPR operand we are allowed to use. -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); // Find the one SGPR operand we are allowed to use. // @@ -2925,19 +2980,19 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. 
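
splitScalar64BitBFE's BitWidth < 32 path above sign-extends in two steps: V_BFE_I32 produces the sign-extended low 32 bits, then V_ASHRREV_I32 with a shift of 31 broadcasts the sign bit into the high half. The same computation in plain C++, as a sketch of why the pair is equivalent to S_BFE_I64 with offset 0:

    #include <cassert>
    #include <cstdint>

    static int64_t bfeI64(uint64_t Src, unsigned BitWidth) {
      assert(BitWidth > 0 && BitWidth < 32);
      // V_BFE_I32: take BitWidth bits at offset 0, sign-extended to 32 bits.
      int32_t Lo = int32_t(Src << (32 - BitWidth)) >> (32 - BitWidth);
      // V_ASHRREV_I32 31: every bit of the high half becomes the sign bit.
      int32_t Hi = Lo >> 31;
      // REG_SEQUENCE: recombine sub0/sub1 into the 64-bit result.
      return int64_t((uint64_t(uint32_t(Hi)) << 32) | uint32_t(Lo));
    }

    int main() {
      assert(bfeI64(0x80, 8) == -128); // 0x80 read as a signed 8-bit field
      assert(bfeI64(0x7f, 8) == 127);
      return 0;
    }
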
- unsigned SGPRReg = findImplicitSGPRRead(*MI); + unsigned SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; if (Idx == -1) break; - const MachineOperand &MO = MI->getOperand(Idx); + const MachineOperand &MO = MI.getOperand(Idx); if (!MO.isReg()) continue; @@ -2981,70 +3036,6 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, return SGPRReg; } -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, unsigned OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); @@ -3059,9 +3050,9 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.isAmdHsaOS()) { RsrcDataFormat |= (1ULL << 56); - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - // Set MTYPE = 2 - RsrcDataFormat |= (2ULL << 59); + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + // Set MTYPE = 2 + RsrcDataFormat |= (2ULL << 59); } return RsrcDataFormat; @@ -3072,22 +3063,103 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + + Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | + // 
IndexStride = 64
+            (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+
   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
   // Clear them unless we want a huge stride.
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
 
   return Rsrc23;
 }
 
-bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
-  unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
 
   return isSMRD(Opc);
 }
 
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
-  unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
 
   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
 }
+
+unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
+  unsigned DescSize = Desc.getSize();
+
+  // If we have a definitive size, we can use it. Otherwise we need to inspect
+  // the operands to know the size.
+  if (DescSize == 8 || DescSize == 4)
+    return DescSize;
+
+  assert(DescSize == 0);
+
+  // 4-byte instructions may have a 32-bit literal encoded after them. Check
+  // operands that could ever be literals.
+  if (isVALU(MI) || isSALU(MI)) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    if (Src0Idx == -1)
+      return 4; // No operands.
+
+    if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
+      return 8;
+
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+    if (Src1Idx == -1)
+      return 4;
+
+    if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
+      return 8;
+
+    return 4;
+  }
+
+  switch (Opc) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::BUNDLE:
+  case TargetOpcode::EH_LABEL:
+    return 0;
+  case TargetOpcode::INLINEASM: {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const char *AsmStr = MI.getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  default:
+    llvm_unreachable("unable to find instruction size");
+  }
+}
+
+ArrayRef<std::pair<int, const char *>>
+SIInstrInfo::getSerializableTargetIndices() const {
+  static const std::pair<int, const char *> TargetIndices[] = {
+      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+  return makeArrayRef(TargetIndices);
+}
+
+/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                                const ScheduleDAG *DAG) const {
+  return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
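
getScratchRsrcWords23 above fills in the element-size and index-stride fields of words 2-3 of the scratch buffer resource descriptor, using the RSRC_* shift constants this patch adds to the header (bits 51, 53, and 55 of the 64-bit word pair). A standalone sketch of the same bit assembly, assuming Log2_32 is floor-log2 and the element size is a power of two:

    #include <cassert>
    #include <cstdint>

    static uint64_t scratchRsrcWords23(unsigned MaxPrivateElementSize) {
      const unsigned EltSizeShift = 32 + 19;     // RSRC_ELEMENT_SIZE_SHIFT
      const unsigned IndexStrideShift = 32 + 21; // RSRC_INDEX_STRIDE_SHIFT
      const uint64_t TidEnable = UINT64_C(1) << (32 + 23); // RSRC_TID_ENABLE

      uint64_t Rsrc23 = TidEnable | 0xffffffffu; // TID_ENABLE | Size
      unsigned Log2 = 0; // floor(log2(MaxPrivateElementSize)), as Log2_32
      while ((1u << (Log2 + 1)) <= MaxPrivateElementSize)
        ++Log2;
      Rsrc23 |= (uint64_t(Log2 - 1) << EltSizeShift) |
                (UINT64_C(3) << IndexStrideShift); // IndexStride = 64
      return Rsrc23;
    }

    int main() {
      uint64_t R = scratchRsrcWords23(4);   // ELEMENT_SIZE = log2(4) - 1 = 1
      assert(((R >> (32 + 19)) & 3) == 1);
      assert(((R >> (32 + 21)) & 3) == 3);  // index stride 64
      assert((R >> (32 + 23)) & 1);         // TID_ENABLE
      return 0;
    }
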
+ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { + return new GCNHazardRecognizer(MF); +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index cce1ae725611..227b817227c2 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H -#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" #include "SIDefines.h" @@ -22,9 +22,24 @@ namespace llvm { -class SIInstrInfo : public AMDGPUInstrInfo { +class SIInstrInfo final : public AMDGPUInstrInfo { private: const SIRegisterInfo RI; + const SISubtarget &ST; + + // The the inverse predicate should have the negative value. + enum BranchPredicate { + INVALID_BR = 0, + SCC_TRUE = 1, + SCC_FALSE = -1, + VCCNZ = 2, + VCCZ = -2, + EXECNZ = -3, + EXECZ = 3 + }; + + static unsigned getBranchOpcode(BranchPredicate Cond); + static BranchPredicate getBranchPredicate(unsigned Opcode); unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, @@ -39,87 +54,89 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void swapOperands(MachineInstr &Inst) const; void lowerScalarAbs(SmallVectorImpl &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, - MachineInstr *Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode) const; void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, - MachineInstr *Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode) const; void splitScalar64BitBCNT(SmallVectorImpl &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void splitScalar64BitBFE(SmallVectorImpl &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, SmallVectorImpl &Worklist) const; + void + addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, + SmallVectorImpl &Worklist) const; + const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; - bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const; + bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const; - unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: - MachineInstr *commuteInstructionImpl(MachineInstr *MI, - bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; public: - explicit SIInstrInfo(const AMDGPUSubtarget &st); - const SIRegisterInfo &getRegisterInfo() const override { + enum TargetOperandFlags { + MO_NONE = 0, + MO_GOTPCREL = 1 + }; + + explicit SIInstrInfo(const SISubtarget &); + + const SIRegisterInfo &getRegisterInfo() const { return RI; } - bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const override; bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - 
unsigned &Offset, + bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const final; + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + unsigned NumLoads) const final; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, - unsigned TmpReg, - unsigned Offset, - unsigned Size) const; + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI, + RegScavenger *RS, unsigned TmpReg, + unsigned Offset, unsigned Size) const; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MI, unsigned SrcReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, + MachineBasicBlock::iterator MI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; // \brief Returns an opcode that can be used to move a value to a \p DstRC // register. 
If there is no hardware instruction that can store to \p @@ -129,28 +146,40 @@ public: LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - bool findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, + bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool areMemAccessesTriviallyDisjoint( - MachineInstr *MIa, MachineInstr *MIb, - AliasAnalysis *AA = nullptr) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef Cond, + const DebugLoc &DL) const override; - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - bool isMov(unsigned Opcode) const override; + bool ReverseBranchCondition( + SmallVectorImpl &Cond) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const final; + bool + areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + AliasAnalysis *AA = nullptr) const override; + + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, + MachineRegisterInfo *MRI) const final; unsigned getMachineCSELookAheadLimit() const override { return 500; } MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB, - MachineBasicBlock::iterator &MI, + MachineInstr &MI, LiveVariables *LV) const override; + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + static bool isSALU(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SALU; } @@ -167,6 +196,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isVMEM(const MachineInstr &MI) { + return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI); + } + + bool isVMEM(uint16_t Opcode) const { + return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode); + } + static bool isSOP1(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SOP1; } @@ -279,6 +316,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isGather4(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::Gather4; + } + + bool isGather4(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::Gather4; + } + static bool isFLAT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } @@ -303,11 +348,35 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } + static bool isDPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DPP; + } + + bool isDPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DPP; + } + + static bool isScalarUnit(const MachineInstr &MI) { + return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); + } + + static bool usesVM_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; + } + + bool isVGPRCopy(const MachineInstr &MI) const { + assert(MI.isCopy()); + unsigned Dest = MI.getOperand(0).getReg(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return !RI.isSGPRReg(MRI, Dest); + } + bool isInlineConstant(const APInt &Imm) const; bool 
isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. @@ -326,7 +395,7 @@ public: bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const; - bool verifyInstruction(const MachineInstr *MI, + bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; static unsigned getVALUOp(const MachineInstr &MI); @@ -374,11 +443,11 @@ public: /// /// If the operand being legalized is a register, then a COPY will be used /// instead of MOV. - void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const; /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand /// for \p MI. - bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; /// \brief Check if \p MO would be a valid operand for the given operand @@ -396,52 +465,38 @@ public: /// \brief Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. - void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; /// \brief Fix operands in \p MI to satisfy constant bus requirements. - void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const; - /// \brief Legalize all operands in this instruction. This function may - /// create new instruction and insert them before \p MI. - void legalizeOperands(MachineInstr *MI) const; + /// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only + /// be used when it is know that the value in SrcReg is same across all + /// threads in the wave. + /// \returns The SGPR register that \p SrcReg was copied to. + unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, + MachineRegisterInfo &MRI) const; - /// \brief Split an SMRD instruction into two smaller loads of half the - // size storing the results in \p Lo and \p Hi. - void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const; + void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const; + /// \brief Legalize all operands in this instruction. This function may + /// create new instruction and insert them before \p MI. + void legalizeOperands(MachineInstr &MI) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. 
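
The isImmOperandLegal/isLiteralConstant checks above hinge on the VALU distinction between inline constants, which are encoded in the instruction word itself, and literal constants, which take an extra 32-bit dword (this is also why getInstSizeInBytes earlier returns 8 for such instructions). A sketch of the classic SI rule set; the exact value list comes from the ISA documents rather than this patch, so treat it as an assumption:

    #include <cassert>
    #include <cstdint>

    // Inline-encodable 32-bit operands: small integers and a few FP values.
    static bool isInlineImm32(uint32_t Bits) {
      int32_t I = int32_t(Bits);
      if (I >= -16 && I <= 64) // also covers 0.0 via integer 0
        return true;
      switch (Bits) { // 0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0
      case 0x3f000000: case 0xbf000000: case 0x3f800000: case 0xbf800000:
      case 0x40000000: case 0xc0000000: case 0x40800000: case 0xc0800000:
        return true;
      default:
        return false;
      }
    }

    int main() {
      assert(isInlineImm32(64) && !isInlineImm32(65));
      assert(isInlineImm32(0x3f800000));  // 1.0f
      assert(!isInlineImm32(0x40490fdb)); // pi needs a literal dword
      return 0;
    }
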
void moveToVALU(MachineInstr &MI) const; - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; - - const TargetRegisterClass *getIndirectAddrRegClass() const override; + void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI, + int Count) const; - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; - void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; - - void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, - unsigned SavReg, unsigned IndexReg) const; - - void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; + /// \brief Return the number of wait states that result from executing this + /// instruction. + unsigned getNumWaitStates(const MachineInstr &MI) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. @@ -463,8 +518,26 @@ public: uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; - bool isLowLatencyInstruction(const MachineInstr *MI) const; - bool isHighLatencyInstruction(const MachineInstr *MI) const; + bool isLowLatencyInstruction(const MachineInstr &MI) const; + bool isHighLatencyInstruction(const MachineInstr &MI) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. 
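
getMCOpcodeFromPseudo, declared just below, resolves a pseudo instruction against the encoding-family table that the SIMCInstr tags in SIInstrInfo.td generate (see the SIEncodingFamily def later in this patch). A sketch of that two-level lookup with hypothetical opcode numbers; the real table is TableGen-emitted, not hand-written:

    #include <cassert>
    #include <map>
    #include <utility>

    enum EncodingFamily { NONE = -1, SI = 0, VI = 1 };

    // Hypothetical numbers; the backend maps (pseudo opcode, family) to the
    // real MC opcode for that hardware generation.
    static int pseudoToMC(int PseudoOpc, EncodingFamily Gen) {
      static const std::map<std::pair<int, int>, int> Table = {
          {{100, SI}, 200}, {{100, VI}, 300},
      };
      auto It = Table.find({PseudoOpc, Gen});
      return It == Table.end() ? PseudoOpc : It->second;
    }

    int main() {
      assert(pseudoToMC(100, SI) == 200);
      assert(pseudoToMC(100, VI) == 300);
      assert(pseudoToMC(42, SI) == 42); // already a native opcode
      return 0;
    }
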
+  const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+    return get(pseudoToMCOpcode(Opcode));
+  }
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+
+  ArrayRef<std::pair<int, const char *>>
+  getSerializableTargetIndices() const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
 };
 
 namespace AMDGPU {
@@ -490,8 +563,9 @@ namespace AMDGPU {
   int getAtomicNoRetOp(uint16_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-  const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
+  const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
+  const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
+  const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
 } // End namespace AMDGPU
 
 namespace SI {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8735277149a6..253cc32b27e4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,9 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 def isCI : Predicate<"Subtarget->getGeneration() "
-                     ">= AMDGPUSubtarget::SEA_ISLANDS">;
+                     ">= SISubtarget::SEA_ISLANDS">;
 def isCIOnly : Predicate<"Subtarget->getGeneration() =="
-                         "AMDGPUSubtarget::SEA_ISLANDS">,
+                         "SISubtarget::SEA_ISLANDS">,
   AssemblerPredicate <"FeatureSeaIslands">;
 
 def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -69,6 +69,11 @@ class sopk <bits<5> si, bits<5> vi = si> {
   field bits<5> VI = vi;
 }
 
+class dsop <bits<8> si, bits<8> vi = si> {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
+}
+
 // Specify an SMRD opcode for SI and SMEM opcode for VI
 
 // FIXME: This should really be bits<5> si, Tablegen crashes if
@@ -78,9 +83,9 @@ class smrd <bits<8> si, bits<8> vi = si> {
   field bits<8> VI = vi;
 }
 
-// Except for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUInstrInfo.cpp
-def SISubtarget {
+// Except for the NONE field, this must be kept in sync with the
+// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+def SIEncodingFamily {
   int NONE = -1;
   int SI = 0;
   int VI = 1;
@@ -95,6 +100,14 @@ def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
   [SDNPMayLoad, SDNPMemOperand]
 >;
 
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
   SDTypeProfile<0, 13,
     [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
@@ -120,7 +133,7 @@ def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
 >;
 
 class SDSample<string opcode> : SDNode <opcode,
-  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
                        SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
 >;
 
@@ -129,9 +142,8 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
 def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
 
-def SIconstdata_ptr : SDNode<
-  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
-                                                     SDTCisVT<0, i64>]>
+def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
+  SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
 >;
 
 //===----------------------------------------------------------------------===//
@@ -140,12 +152,14 @@ def SIconstdata_ptr : SDNode<
 
 class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), (ld
node:$ptr), [{
-  return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
-         isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
-         isConstantLoad(cast<LoadSDNode>(N), -1);
+  const MemSDNode *LD = cast<MemSDNode>(N);
+  return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+         LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+         LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
 def flat_load          : flat_ld <load>;
+def atomic_flat_load   : flat_ld <atomic_load>;
 def flat_az_extloadi8  : flat_ld <az_extloadi8>;
 def flat_sextloadi8    : flat_ld <sextloadi8>;
 def flat_az_extloadi16 : flat_ld <az_extloadi16>;
@@ -153,25 +167,49 @@ def flat_sextloadi16 : flat_ld <sextloadi16>;
 
 class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
     (st node:$val, node:$ptr), [{
-  return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
-         isGlobalStore(dyn_cast<StoreSDNode>(N));
+  const MemSDNode *ST = cast<MemSDNode>(N);
+  return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+         ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
 def flat_store         : flat_st <store>;
+def atomic_flat_store  : flat_st <atomic_store>;
 def flat_truncstorei8  : flat_st <truncstorei8>;
 def flat_truncstorei16 : flat_st <truncstorei16>;
 
+class MubufLoad <SDPatternOperator op> : PatFrag <
+  (ops node:$ptr), (op node:$ptr), [{
 
-def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
-  return isGlobalLoad(cast<LoadSDNode>(N)) ||
-         isConstantLoad(cast<LoadSDNode>(N), -1);
+  const MemSDNode *LD = cast<MemSDNode>(N);
+  return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+         LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }]>;
 
+def mubuf_load          : MubufLoad <load>;
+def mubuf_az_extloadi8  : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8    : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16   : MubufLoad <sextloadi16>;
+
+def mubuf_load_atomic   : MubufLoad <atomic_load>;
+
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
-  return isConstantLoad(cast<LoadSDNode>(N), -1) &&
-         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+  auto Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= 4 &&
+         Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
 }]>;
 
+//===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+def atomic_inc_global : global_binary_atomic_op <SIatomic_inc>;
+def atomic_dec_global : global_binary_atomic_op <SIatomic_dec>;
+
+def atomic_inc_flat : flat_binary_atomic_op <SIatomic_inc>;
+def atomic_dec_flat : flat_binary_atomic_op <SIatomic_dec>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
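
Every PatFrag rewritten above reduces to the same question: which address space does the MemSDNode live in. A standalone model of the selection rules, using the address-space numbering this tree uses (treat the exact numbers as an assumption to check against AMDGPUAS): 0 private, 1 global, 2 constant, 3 local, 4 flat.

    #include <cassert>

    enum AddrSpace { PRIVATE = 0, GLOBAL = 1, CONSTANT = 2, LOCAL = 3, FLAT = 4 };

    static bool matchesFlatLoad(AddrSpace AS) {  // flat_ld
      return AS == FLAT || AS == GLOBAL || AS == CONSTANT;
    }
    static bool matchesMubufLoad(AddrSpace AS) { // MubufLoad
      return AS == GLOBAL || AS == CONSTANT;
    }
    static bool matchesSmrdLoad(AddrSpace AS, unsigned Align, bool Uniform) {
      return AS == CONSTANT && Align >= 4 && Uniform; // smrd_load
    }

    int main() {
      assert(matchesFlatLoad(GLOBAL) && !matchesMubufLoad(FLAT));
      assert(matchesSmrdLoad(CONSTANT, 4, true) &&
             !matchesSmrdLoad(CONSTANT, 2, true)); // under-aligned: no SMRD
      return 0;
    }
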
@@ -182,7 +220,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad, >; def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return isLocalLoad(cast(N)); + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ @@ -219,7 +257,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore, def si_st_local : PatFrag < (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return isLocalStore(cast(N)); + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def si_store_local : PatFrag < @@ -247,9 +285,34 @@ def si_truncstore_local_i16 : PatFrag < return cast(N)->getMemoryVT() == MVT::i16; }]>; -multiclass SIAtomicM0Glue2 { +def si_setcc_uniform : PatFrag < + (ops node:$lhs, node:$rhs, node:$cond), + (setcc node:$lhs, node:$rhs, node:$cond), [{ + for (SDNode *Use : N->uses()) { + if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg) + return false; + + unsigned Reg = cast(Use->getOperand(1))->getReg(); + if (Reg != AMDGPU::SCC) + return false; + } + return true; +}]>; + +def si_uniform_br : PatFrag < + (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{ + return isUniformBr(N); +}]>; + +def si_uniform_br_scc : PatFrag < + (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{ + return isCBranchSCC(N); +}]>; + +multiclass SIAtomicM0Glue2 { - def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + def _glue : SDNode < + !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; @@ -257,11 +320,13 @@ multiclass SIAtomicM0Glue2 { } defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>; +defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; @@ -347,6 +412,10 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def SIMM16bit : PatLeaf <(imm), + [{return isInt<16>(N->getSExtValue());}] +>; + def IMM20bit : PatLeaf <(imm), [{return isUInt<20>(N->getZExtValue());}] >; @@ -369,7 +438,7 @@ class InlineFPImm : PatLeaf <(vt fpimm), [{ }]>; class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = @@ -402,188 +471,133 @@ def sopp_brtarget : Operand { let ParserMatchClass = SoppBrTarget; } -def const_ga : Operand; - -include "SIInstrFormats.td" -include "VIInstrFormats.td" +def si_ga : Operand; -def MubufOffsetMatchClass : AsmOperandClass { - let Name = "MubufOffset"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; +def InterpSlot : Operand { + let PrintMethod = "printInterpSlot"; } -class DSOffsetBaseMatchClass : AsmOperandClass { - let Name = "DSOffset"#parser; - let ParserMethod = parser; +def SendMsgMatchClass : AsmOperandClass { + let Name = "SendMsg"; + let PredicateMethod = "isSendMsg"; + let ParserMethod = "parseSendMsgOp"; let 
RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset"; } -def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; -def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; - -def DSOffset01MatchClass : AsmOperandClass { - let Name = "DSOffset1"; - let ParserMethod = "parseDSOff01OptionalOps"; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset01"; +def SendMsgImm : Operand { + let PrintMethod = "printSendMsg"; + let ParserMatchClass = SendMsgMatchClass; } -class GDSBaseMatchClass : AsmOperandClass { - let Name = "GDS"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; } -def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; -def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; - -class GLCBaseMatchClass : AsmOperandClass { - let Name = "GLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; +def WAIT_FLAG : Operand { + let ParserMatchClass = SWaitMatchClass; + let PrintMethod = "printWaitFlag"; } -def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; -def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; +include "SIInstrFormats.td" +include "VIInstrFormats.td" -class SLCBaseMatchClass : AsmOperandClass { - let Name = "SLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; +class NamedMatchClass : AsmOperandClass { + let Name = "Imm"#CName; + let PredicateMethod = "is"#CName; + let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName); let RenderMethod = "addImmOperands"; + let IsOptional = Optional; + let DefaultMethod = !if(Optional, "default"#CName, ?); } -def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; -def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; -def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; - -class TFEBaseMatchClass : AsmOperandClass { - let Name = "TFE"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; +class NamedOperandBit : Operand { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; -def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; -def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; - -def OModMatchClass : AsmOperandClass { - let Name = "OMod"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; +class NamedOperandU8 : Operand { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def ClampMatchClass : AsmOperandClass { - let Name = "Clamp"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; +class NamedOperandU16 : Operand { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -class SMRDOffsetBaseMatchClass : AsmOperandClass { - let Name = "SMRDOffset"#predicate; - let PredicateMethod = predicate; - let RenderMethod = "addImmOperands"; +class NamedOperandU32 : Operand { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; -def SMRDLiteralOffsetMatchClass : 
SMRDOffsetBaseMatchClass < - "isSMRDLiteralOffset" ->; - let OperandType = "OPERAND_IMMEDIATE" in { -def offen : Operand { - let PrintMethod = "printOffen"; -} -def idxen : Operand { - let PrintMethod = "printIdxen"; -} -def addr64 : Operand { - let PrintMethod = "printAddr64"; -} -def mbuf_offset : Operand { - let PrintMethod = "printMBUFOffset"; - let ParserMatchClass = MubufOffsetMatchClass; -} -class ds_offset_base : Operand { - let PrintMethod = "printDSOffset"; - let ParserMatchClass = mc; -} -def ds_offset : ds_offset_base ; -def ds_offset_gds : ds_offset_base ; +def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; +def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; +def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; -def ds_offset0 : Operand { - let PrintMethod = "printDSOffset0"; - let ParserMatchClass = DSOffset01MatchClass; -} -def ds_offset1 : Operand { - let PrintMethod = "printDSOffset1"; - let ParserMatchClass = DSOffset01MatchClass; -} -class gds_base : Operand { - let PrintMethod = "printGDS"; - let ParserMatchClass = mc; -} -def gds : gds_base ; +def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; +def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; +def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; -def gds01 : gds_base ; +def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>; -class glc_base : Operand { - let PrintMethod = "printGLC"; - let ParserMatchClass = mc; -} +def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; +def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; -def glc : glc_base ; -def glc_flat : glc_base ; +def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>; +def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>; -class slc_base : Operand { - let PrintMethod = "printSLC"; - let ParserMatchClass = mc; -} +def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; +def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; +def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; +def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; +def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; -def slc : slc_base ; -def slc_flat : slc_base ; -def slc_flat_atomic : slc_base ; +def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; -class tfe_base : Operand { - let PrintMethod = "printTFE"; - let ParserMatchClass = mc; -} +def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; +def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; +def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; +def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; -def tfe : tfe_base ; -def tfe_flat : tfe_base ; -def tfe_flat_atomic : tfe_base ; +def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; +def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; +def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; +def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; -def omod : Operand { - let PrintMethod = "printOModSI"; - let ParserMatchClass = OModMatchClass; -} +def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; + +} // End OperandType = "OPERAND_IMMEDIATE" -def 
ClampMod : Operand { - let PrintMethod = "printClampSI"; - let ParserMatchClass = ClampMatchClass; -} -def smrd_offset : Operand { - let PrintMethod = "printU32ImmOperand"; - let ParserMatchClass = SMRDOffsetMatchClass; +def VOPDstS64 : VOPDstOperand ; + +def FPInputModsMatchClass : AsmOperandClass { + let Name = "RegOrImmWithFPInputMods"; + let ParserMethod = "parseRegOrImmWithFPInputMods"; + let PredicateMethod = "isRegOrImmWithInputMods"; } -def smrd_literal_offset : Operand { - let PrintMethod = "printU32ImmOperand"; - let ParserMatchClass = SMRDLiteralOffsetMatchClass; +def FPInputMods : Operand { + let PrintMethod = "printOperandAndFPInputMods"; + let ParserMatchClass = FPInputModsMatchClass; } -} // End OperandType = "OPERAND_IMMEDIATE" +def IntInputModsMatchClass : AsmOperandClass { + let Name = "RegOrImmWithIntInputMods"; + let ParserMethod = "parseRegOrImmWithIntInputMods"; + let PredicateMethod = "isRegOrImmWithInputMods"; +} -def VOPDstS64 : VOPDstOperand ; +def IntInputMods: Operand { + let PrintMethod = "printOperandAndIntInputMods"; + let ParserMatchClass = IntInputModsMatchClass; +} //===----------------------------------------------------------------------===// // Complex patterns @@ -595,9 +609,13 @@ def DS64Bit4ByteAligned : ComplexPattern; def MUBUFAddr32 : ComplexPattern; def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; +def FLATAtomic : ComplexPattern; def MUBUFScratch : ComplexPattern; def MUBUFOffset : ComplexPattern; +def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; +def MUBUFIntrinsicOffset : ComplexPattern; +def MUBUFIntrinsicVOffset : ComplexPattern; def SMRDImm : ComplexPattern; def SMRDImm32 : ComplexPattern; @@ -606,6 +624,8 @@ def SMRDBufferImm : ComplexPattern; def SMRDBufferImm32 : ComplexPattern; def SMRDBufferSgpr : ComplexPattern; +def MOVRELOffset : ComplexPattern; + def VOP3Mods0 : ComplexPattern; def VOP3NoMods0 : ComplexPattern; def VOP3Mods0Clamp : ComplexPattern; @@ -670,17 +690,24 @@ class EXPCommon : InstSI< let EXP_CNT = 1; let Uses = [EXEC]; + let SchedRW = [WriteExport]; } multiclass EXP_m { let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ; } - def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe { + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; + } - def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; + def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi { + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; + } } //===----------------------------------------------------------------------===// @@ -689,7 +716,7 @@ multiclass EXP_m { class SOP1_Pseudo pattern> : SOP1 , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -697,17 +724,21 @@ class SOP1_Pseudo pattern> : class SOP1_Real_si : SOP1 , SOP1e , - SIMCInstr { + SIMCInstr { let isCodeGenOnly = 0; let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class SOP1_Real_vi : SOP1 , SOP1e , - SIMCInstr { + SIMCInstr { let isCodeGenOnly = 0; let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass SOP1_m pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs 
SReg_32:$sdst), (ins SSrc_32:$src0), + opName#" $sdst, $src0", pattern >; multiclass SOP1_64 pattern> : SOP1_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0), + opName#" $sdst, $src0", pattern >; // no input, 64-bit output. multiclass SOP1_64_0 pattern> { - def "" : SOP1_Pseudo ; + def "" : SOP1_Pseudo ; - def _si : SOP1_Real_si { - let ssrc0 = 0; + def _si : SOP1_Real_si { + let src0 = 0; } - def _vi : SOP1_Real_vi { - let ssrc0 = 0; + def _vi : SOP1_Real_vi { + let src0 = 0; } } @@ -763,13 +794,19 @@ multiclass SOP1_1 pattern> { // 64-bit input, 32-bit output. multiclass SOP1_32_64 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0), + opName#" $sdst, $src0", pattern +>; + +// 32-bit input, 64-bit output. +multiclass SOP1_64_32 pattern> : SOP1_m < + op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0), + opName#" $sdst, $src0", pattern >; class SOP2_Pseudo pattern> : SOP2, - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; let Size = 4; @@ -784,15 +821,19 @@ class SOP2_Pseudo pattern> : class SOP2_Real_si : SOP2, SOP2e, - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class SOP2_Real_vi : SOP2, SOP2e, - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass SOP2_m pattern> : SOP2_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern >; multiclass SOP2_64 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $sdst, $src0, $src1", pattern >; multiclass SOP2_64_32 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern >; -class SOPC_Helper op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC < - op, (outs), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []> { +multiclass SOP2_64_32_32 pattern> : SOP2_m < + op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern +>; + +class SOPC_Base op, RegisterOperand rc0, RegisterOperand rc1, + string opName, list pattern = []> : SOPC < + op, (outs), (ins rc0:$src0, rc1:$src1), + opName#" $src0, $src1", pattern > { let Defs = [SCC]; } +class SOPC_Helper op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC_Base < + op, rc, rc, opName, + [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { +} -class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> +class SOPC_CMP_32 op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper; -class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; +class SOPC_32 op, string opName, list pattern = []> + : SOPC_Base; + +class SOPC_64_32 op, string opName, list pattern = []> + : SOPC_Base; class SOPK_Pseudo pattern> : SOPK , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let 
isCodeGenOnly = 1; } @@ -844,16 +898,20 @@ class SOPK_Pseudo pattern> : class SOPK_Real_si : SOPK , SOPKe , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; let isCodeGenOnly = 0; } class SOPK_Real_vi : SOPK , SOPKe , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; let isCodeGenOnly = 0; } @@ -868,14 +926,14 @@ multiclass SOPK_m pattern> { - def "" : SOPK_Pseudo ; - def _si : SOPK_Real_si ; + def _si : SOPK_Real_si ; - def _vi : SOPK_Real_vi ; + def _vi : SOPK_Real_vi ; } multiclass SOPK_SCC pattern> { @@ -908,15 +966,19 @@ multiclass SOPK_IMM32 , SOPK64e , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; let isCodeGenOnly = 0; } def _vi : SOPK , SOPK64e , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; let isCodeGenOnly = 0; } } @@ -926,86 +988,145 @@ multiclass SOPK_IMM32 pattern> : SMRD , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } -class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, - string asm> : +class SMRD_IMM_Real_si op, string opName, dag outs, dag ins, + string asm> : + SMRD , + SMRD_IMMe , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class SMRD_SOFF_Real_si op, string opName, dag outs, dag ins, + string asm> : SMRD , - SMRDe , - SIMCInstr { + SMRD_SOFFe , + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + + +class SMRD_IMM_Real_vi op, string opName, dag outs, dag ins, + string asm, list pattern = []> : + SMRD , + SMEM_IMMe_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, - string asm, list pattern = []> : +class SMRD_SOFF_Real_vi op, string opName, dag outs, dag ins, + string asm, list pattern = []> : SMRD , - SMEMe_vi , - SIMCInstr { + SMEM_SOFFe_vi , + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -multiclass SMRD_m pattern> { def "" : SMRD_Pseudo ; - def _si : SMRD_Real_si ; + def _si : SMRD_IMM_Real_si ; // glc is only applicable to scalar stores, which are not yet // implemented. let glc = 0 in { - def _vi : SMRD_Real_vi ; + def _vi : SMRD_IMM_Real_vi ; } } -multiclass SMRD_Inval { - let hasSideEffects = 1, mayStore = 1 in { - def "" : SMRD_Pseudo ; +multiclass SMRD_SOFF_m pattern> { - let sbase = 0, offset = 0 in { - let sdst = 0 in { - def _si : SMRD_Real_si ; - } + def "" : SMRD_Pseudo ; + + def _si : SMRD_SOFF_Real_si ; + + // glc is only applicable to scalar stores, which are not yet + // implemented. 
+ let glc = 0 in { + def _vi : SMRD_SOFF_Real_vi ; + } +} + +multiclass SMRD_Special pattern = []> { + let hasSideEffects = 1 in { + def "" : SMRD_Pseudo ; + + let sbase = 0, soff = 0, sdst = sdst_ in { + def _si : SMRD_SOFF_Real_si ; - let glc = 0, sdata = 0 in { - def _vi : SMRD_Real_vi ; + let glc = 0 in { + def _vi : SMRD_SOFF_Real_vi ; } } } } +multiclass SMRD_Inval { + let mayStore = 1 in { + defm : SMRD_Special; + } +} + class SMEM_Inval op, string opName, SDPatternOperator node> : - SMRD_Real_vi { + SMRD_SOFF_Real_vi { let hasSideEffects = 1; let mayStore = 1; let sbase = 0; - let sdata = 0; + let sdst = 0; + let glc = 0; + let soff = 0; +} + +class SMEM_Ret op, string opName, SDPatternOperator node> : + SMRD_SOFF_Real_vi { + let hasSideEffects = 1; + let mayStore = ?; + let mayLoad = ?; + let sbase = 0; let glc = 0; - let offset = 0; + let soff = 0; } multiclass SMRD_Helper { - defm _IMM : SMRD_m < - op, opName#"_IMM", 1, (outs dstClass:$dst), + defm _IMM : SMRD_IMM_m < + op, opName#"_IMM", (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_offset:$offset), - opName#" $dst, $sbase, $offset", [] + opName#" $sdst, $sbase, $offset", [] >; def _IMM_ci : SMRD < - (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), - opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci { + (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci { let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; } - defm _SGPR : SMRD_m < - op, opName#"_SGPR", 0, (outs dstClass:$dst), + defm _SGPR : SMRD_SOFF_m < + op, opName#"_SGPR", (outs dstClass:$sdst), (ins baseClass:$sbase, SReg_32:$soff), - opName#" $dst, $sbase, $soff", [] + opName#" $sdst, $sbase, $soff", [] >; } @@ -1013,20 +1134,6 @@ multiclass SMRD_Helper { - let PrintMethod = "printOperandAndMods"; -} - -def InputModsMatchClass : AsmOperandClass { - let Name = "RegWithInputMods"; -} - -def InputModsNoDefault : Operand { - let PrintMethod = "printOperandAndMods"; - let ParserMatchClass = InputModsMatchClass; -} - class getNumSrcArgs { int ret = !if (!eq(Src0.Value, untyped.Value), 0, @@ -1050,12 +1157,12 @@ class getVOPSrc0ForVT { RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } -// Returns the register class to use for source 1 of VOP[12C] for the -// given VT. -class getVOPSrc1ForVT { +// Returns the vreg register class to use for source operand given VT +class getVregSrcForVT { RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } + // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT { @@ -1072,8 +1179,10 @@ class getVOP3SrcForVT { // Returns 1 if the source arguments have modifiers, 0 if they do not. // XXX - do f16 instructions? class hasModifiers { - bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, 0)); + bit ret = + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + 0)); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
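
SMRD_Helper above instantiates three addressing forms per scalar load: _IMM (encodable immediate offset), _IMM_ci (the CI-only 32-bit literal), and _SGPR (offset held in a register). A sketch of how a byte offset picks a form, assuming the usual encodings for this generation range (SI/CI take an 8-bit dword offset in the IMM form, VI a 20-bit byte offset, and only CI has the literal fallback); the field widths are from the ISA docs, not this hunk:

    #include <cassert>
    #include <cstdint>

    enum Gen { SI, CI, VI };
    enum Form { IMM, IMM_CI, SGPR_OFF };

    static Form selectSMRDForm(uint64_t ByteOffset, Gen G) {
      bool FitsImm = G == VI
                         ? ByteOffset < (1ull << 20)
                         : (ByteOffset % 4 == 0 && ByteOffset / 4 < 256);
      if (FitsImm)
        return IMM;
      if (G == CI && ByteOffset % 4 == 0)
        return IMM_CI; // 32-bit literal dword offset
      return SGPR_OFF; // materialize the offset in an SGPR
    }

    int main() {
      assert(selectSMRDForm(1020, SI) == IMM);      // 255 dwords: fits
      assert(selectSMRDForm(1024, SI) == SGPR_OFF); // 256 dwords: too big
      assert(selectSMRDForm(1024, CI) == IMM_CI);
      assert(selectSMRDForm(1024, VI) == IMM);      // fits 20-bit byte offset
      return 0;
    }
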
@@ -1089,11 +1198,15 @@ class getIns64 { dag ret = + !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP, V_CLREXCP) + (ins), + /* else */ !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - ClampMod:$clamp, omod:$omod) + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod) /* else */, // VOP1 without modifiers (ins Src0RC:$src0) @@ -1101,9 +1214,9 @@ class getIns64 { + + dag ret = !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl), + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1_DPP with modifiers + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* else */, + // VOP1_DPP without modifiers + (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* endif */) + /* NumSrcArgs == 2 */, + !if (!eq(HasModifiers, 1), + // VOP2_DPP with modifiers + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + FPInputMods:$src1_modifiers, Src1RC:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* else */, + // VOP2_DPP without modifiers + (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, + row_mask:$row_mask, bank_mask:$bank_mask, + bound_ctrl:$bound_ctrl) + /* endif */))); +} + +class getInsSDWA { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + !if(HasFloatModifiers, + // VOP1_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel) + /* else */, + // VOP1_SDWA with sext modifier + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel) + /* endif */) + /* NumSrcArgs == 2 */, + !if(HasFloatModifiers, + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + FPInputMods:$src1_fmodifiers, Src1RC:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA or VOPC_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + FPInputMods:$src1_fmodifiers, Src1RC:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel) + ), + /* else */ + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA with sext modifiers + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + IntInputMods:$src1_imodifiers, Src1RC:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA or VOPC_SDWA with sext modifier + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + IntInputMods:$src1_imodifiers, Src1RC:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel) + ) + /* endif */))); +} + +// Outs for DPP and SDWA +class getOutsExt { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions + (outs DstRCDPP:$vdst)), + (outs)); // V_NOP } // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. 
-class getAsm32 { - string dst = "$dst"; +class getAsm32 { + string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; @@ -1137,7 +1338,8 @@ class getAsm32 { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 { +class getAsm64 { + string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1145,8 +1347,71 @@ class getAsm64 { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32.ret, - "$dst, "#src0#src1#src2#"$clamp"#"$omod"); + getAsm32.ret, + dst#", "#src0#src1#src2#"$clamp"#"$omod"); +} + +class getAsmDPP { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string args = !if(!eq(HasModifiers, 0), + getAsm32<0, NumSrcArgs, DstVT>.ret, + ", "#src0#src1); + string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; +} + +class getAsmSDWA { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + " vcc", // use vcc token as dst for VOPC instructioins + "$vdst"), + ""); + string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers"); + string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers"); + string args = !if(!eq(NumSrcArgs, 0), + "", + !if(!eq(NumSrcArgs, 1), + ", "#src0#"$clamp", + ", "#src0#", "#src1#"$clamp" + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), + "", + !if(!eq(NumSrcArgs, 1), + " $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC + " $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + +// Function that checks if instruction supports DPP and SDWA +class getHasExt { + bit ret = !if(!eq(NumSrcArgs, 3), + 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3 + !if(!eq(DstVT.Size, 64), + 0, // 64-bit dst - No DPP or SDWA for 64-bit operands + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src0 + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src2 + 1 + ) + ) + ) + ); } class VOPProfile _ArgVT> { @@ -1158,30 +1423,48 @@ class VOPProfile _ArgVT> { field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT.ret; + field RegisterOperand DstRCDPP = getVALUDstForVT.ret; + field RegisterOperand DstRCSDWA = getVALUDstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; + field RegisterClass Src1RC32 = getVregSrcForVT.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; + field RegisterClass Src0DPP = getVregSrcForVT.ret; + field RegisterClass Src1DPP = getVregSrcForVT.ret; + field RegisterClass Src0SDWA = getVregSrcForVT.ret; + field RegisterClass Src1SDWA = getVregSrcForVT.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; field int NumSrcArgs = getNumSrcArgs.ret; field bit HasModifiers = hasModifiers.ret; - field dag Outs = !if(HasDst,(outs 
DstRC:$dst),(outs)); + field bit HasExt = getHasExt.ret; + + field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs)); // VOP3b instructions are a special case with a second explicit // output. This is manually overridden for them. field dag Outs32 = Outs; field dag Outs64 = Outs; + field dag OutsDPP = getOutsExt.ret; + field dag OutsSDWA = getOutsExt.ret; field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64.ret; + field dag InsDPP = getInsDPP.ret; + field dag InsSDWA = getInsSDWA.ret; + + field string Asm32 = getAsm32.ret; + field string Asm64 = getAsm64.ret; + field string AsmDPP = getAsmDPP.ret; + field string AsmSDWA = getAsmSDWA.ret; +} - field string Asm32 = getAsm32.ret; - field string Asm64 = getAsm64.ret; +class VOP_NO_EXT : VOPProfile { + let HasExt = 0; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order @@ -1194,6 +1477,9 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; +def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; @@ -1216,10 +1502,10 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; // Write out to vcc or arbitrary SGPR. def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { - let Asm32 = "$dst, vcc, $src0, $src1"; - let Asm64 = "$dst, $sdst, $src0, $src1"; - let Outs32 = (outs DstRC:$dst); - let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + let Asm32 = "$vdst, vcc, $src0, $src1"; + let Asm64 = "$vdst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); } // Write out to vcc or arbitrary SGPR and read in from vcc or @@ -1231,10 +1517,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // restriction. SGPRs are still allowed because it should // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; - let Asm32 = "$dst, vcc, $src0, $src1, vcc"; - let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; - let Outs32 = (outs DstRC:$dst); - let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); +} + +// Read in from vcc or arbitrary SGPR +def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. + let Asm32 = "$vdst, $src0, $src1, vcc"; + let Asm64 = "$vdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -1263,11 +1562,17 @@ class VOPC_Profile : VOPProfile <[i1, vt0, v let Asm32 = "vcc, $src0, $src1"; // The destination for 32-bit encoding is implicit. 
let HasDst32 = 0; + let Outs64 = (outs DstRC:$sdst); } class VOPC_Class_Profile : VOPC_Profile { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; + let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0, + IntInputMods:$src1_imodifiers, Src1RC64:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel"; + } def VOPC_I1_F32_F32 : VOPC_Profile; @@ -1281,28 +1586,42 @@ def VOPC_I1_F64_I32 : VOPC_Class_Profile; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; -def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); - let Asm64 = "$dst, $src0, $src1, $src2"; -} def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = "$dst, $src0, $vsrc1, $src2"; +def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm); + field string Asm32 = "$vdst, $src0, $src1, $imm"; + field bit HasExt = 0; +} +def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1); + field string Asm32 = "$vdst, $src0, $imm, $src1"; + field bit HasExt = 0; } def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, HasModifiers>.ret; - let Asm32 = getAsm32<1, 2>.ret; - let Asm64 = getAsm64<1, 2, HasModifiers>.ret; + let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0, + FPInputMods:$src1_modifiers, Src1RC32:$src1, + VGPR_32:$src2, // stub argument + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0, + FPInputMods:$src1_fmodifiers, Src1RC32:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let Asm32 = getAsm32<1, 2, f32>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret; + let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +// This class is used only with VOPC instructions. 
Use $sdst for out operand class SIInstAlias : InstAlias , PredicateControl { @@ -1313,13 +1632,13 @@ class SIInstAlias : !if (p.HasDst32, !if (!eq(p.NumSrcArgs, 0), // 1 dst, 0 src - (inst p.DstRC:$dst), + (inst p.DstRC:$sdst), !if (!eq(p.NumSrcArgs, 1), // 1 dst, 1 src - (inst p.DstRC:$dst, p.Src0RC32:$src0), + (inst p.DstRC:$sdst, p.Src0RC32:$src0), !if (!eq(p.NumSrcArgs, 2), // 1 dst, 2 src - (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1), // else - unreachable (inst)))), // else @@ -1368,7 +1687,7 @@ class AtomicNoRet { class VOP1_Pseudo pattern, string opName> : VOP1Common , VOP , - SIMCInstr , + SIMCInstr , MnemonicAlias { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1379,14 +1698,18 @@ class VOP1_Pseudo pattern, string opName> : class VOP1_Real_si : VOP1, - SIMCInstr { + SIMCInstr { let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class VOP1_Real_vi : VOP1, - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP1_m pattern, @@ -1399,6 +1722,49 @@ multiclass VOP1_m pattern, } +class VOP1_DPP : + VOP1_DPPe , + VOP_DPP { + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "DPP"; + let DisableDecoder = DisableVIDecoder; + let src0_modifiers = !if(p.HasModifiers, ?, 0); + let src1_modifiers = 0; +} + +class SDWADisableFields { + bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?); + bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?); + bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0), + 0, + !if(p.HasModifiers, ?, 0)); + bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0), + 0, + !if(p.HasModifiers, 0, ?)); + bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6, + !if(!eq(p.NumSrcArgs, 1), 6, + ?)); + bits<2> src1_fmodifiers = !if(!eq(p.NumSrcArgs, 0), 0, + !if(!eq(p.NumSrcArgs, 1), 0, + !if(p.HasModifiers, ?, 0))); + bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0, + !if(!eq(p.NumSrcArgs, 1), 0, + !if(p.HasModifiers, 0, ?))); + bits<3> dst_sel = !if(p.HasDst, ?, 6); + bits<2> dst_unused = !if(p.HasDst, ?, 2); + bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?); +} + +class VOP1_SDWA : + VOP1_SDWAe , + VOP_SDWA , + SDWADisableFields
<p>
{ + let AsmMatchConverter = "cvtSdwaVOP1"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + multiclass VOP1SI_m pattern, string asm = opName#p.Asm32> { @@ -1410,7 +1776,7 @@ multiclass VOP1SI_m pattern, class VOP2_Pseudo pattern, string opName> : VOP2Common , VOP , - SIMCInstr, + SIMCInstr, MnemonicAlias { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1418,14 +1784,18 @@ class VOP2_Pseudo pattern, string opName> : class VOP2_Real_si : VOP2 , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class VOP2_Real_vi : VOP2 , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP2SI_m pattern, @@ -1449,6 +1819,26 @@ multiclass VOP2_m pattern, } +class VOP2_DPP : + VOP2_DPPe , + VOP_DPP { + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "DPP"; + let DisableDecoder = DisableVIDecoder; + let src0_modifiers = !if(p.HasModifiers, ?, 0); + let src1_modifiers = !if(p.HasModifiers, ?, 0); +} + +class VOP2_SDWA : + VOP2_SDWAe , + VOP_SDWA , + SDWADisableFields
<p>
{ + let AsmMatchConverter = "cvtSdwaVOP2"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + class VOP3DisableFields { bits<2> src0_modifiers = !if(HasModifiers, ?, 0); @@ -1471,10 +1861,11 @@ class VOP3DisableModFields clamp = !if(HasOutputMods, ?, 0); } -class VOP3_Pseudo pattern, string opName> : - VOP3Common , +class VOP3_Pseudo pattern, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , VOP , - SIMCInstr, + SIMCInstr, MnemonicAlias { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1483,44 +1874,96 @@ class VOP3_Pseudo pattern, string opName> : field bit src0; } -class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , +class VOP3_Real_si op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , VOP3e , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } -class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , +class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , VOP3e_vi , - SIMCInstr { + SIMCInstr { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +class VOP3_C_Real_si op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , + VOP3ce , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class VOP3_C_Real_vi op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , + VOP3ce_vi , + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , +class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , VOP3be , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } -class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , +class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , VOP3be_vi , - SIMCInstr { + SIMCInstr { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +class VOP3e_Real_si op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , + VOP3e , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class VOP3e_Real_vi op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common , + VOP3e_vi , + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP3_m pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { + string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> { def "" : VOP3_Pseudo ; - def _si : VOP3_Real_si , + def _si : VOP3_Real_si , VOP3DisableFields; - def _vi : VOP3_Real_vi , + def _vi : VOP3_Real_vi , VOP3DisableFields; @@ -1529,21 +1972,21 @@ multiclass VOP3_m 
pattern, multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { - def "" : VOP3_Pseudo ; + def "" : VOP3_Pseudo ; - def _si : VOP3_Real_si , + def _si : VOP3_Real_si , VOP3DisableFields<0, 0, HasMods>; - def _vi : VOP3_Real_vi , + def _vi : VOP3_Real_vi , VOP3DisableFields<0, 0, HasMods>; } multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { - def "" : VOP3_Pseudo ; + def "" : VOP3_Pseudo ; - def _si : VOP3_Real_si , + def _si : VOP3_Real_si , VOP3DisableFields<0, 0, HasMods>; // No VI instruction. This class is for SI only. } @@ -1552,13 +1995,13 @@ multiclass VOP3_2_m pattern, string opName, string revOp, bit HasMods = 1> { - def "" : VOP3_Pseudo , + def "" : VOP3_Pseudo , VOP2_REV; - def _si : VOP3_Real_si , + def _si : VOP3_Real_si , VOP3DisableFields<1, 0, HasMods>; - def _vi : VOP3_Real_vi , + def _vi : VOP3_Real_vi , VOP3DisableFields<1, 0, HasMods>; } @@ -1566,10 +2009,10 @@ multiclass VOP3SI_2_m pattern, string opName, string revOp, bit HasMods = 1> { - def "" : VOP3_Pseudo , + def "" : VOP3_Pseudo , VOP2_REV; - def _si : VOP3_Real_si , + def _si : VOP3_Real_si , VOP3DisableFields<1, 0, HasMods>; // No VI instruction. This class is for SI only. @@ -1579,13 +2022,26 @@ multiclass VOP3SI_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit useSrc2Input = 0> { - def "" : VOP3_Pseudo ; + bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { + def "" : VOP3_Pseudo ; + + def _si : VOP3b_Real_si , + VOP3DisableFields<1, useSrc2Input, HasMods>; + + def _vi : VOP3b_Real_vi , + VOP3DisableFields<1, useSrc2Input, HasMods>; +} - def _si : VOP3b_Real_si , +// Same as VOP3b_2_3_m but no 2nd destination (sdst), e.g. v_cndmask_b32. +multiclass VOP3e_2_3_m pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { + def "" : VOP3_Pseudo ; + + def _si : VOP3e_Real_si , VOP3DisableFields<1, useSrc2Input, HasMods>; - def _vi : VOP3b_Real_vi , + def _vi : VOP3e_Real_vi , VOP3DisableFields<1, useSrc2Input, HasMods>; } @@ -1594,19 +2050,19 @@ multiclass VOP3_C_m sched> { - def "" : VOP3_Pseudo , + def "" : VOP3_Pseudo , VOP2_REV { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; } - def _si : VOP3_Real_si , + def _si : VOP3_C_Real_si , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; } - def _vi : VOP3_Real_vi , + def _vi : VOP3_C_Real_vi , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; @@ -1618,19 +2074,23 @@ multiclass VOP2SI_3VI_m pattern = []> { let isPseudo = 1, isCodeGenOnly = 1 in { def "" : VOPAnyCommon , - SIMCInstr; + SIMCInstr; } def _si : VOP2 , - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } def _vi : VOP3Common , VOP3e_vi , VOP3DisableFields <1, 0, 0>, - SIMCInstr { + SIMCInstr { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } @@ -1641,15 +2101,19 @@ multiclass VOP1_Helper pat32, defm _e64 : VOP3_1_m ; + + def _dpp : VOP1_DPP ; + + def _sdwa : VOP1_SDWA ; } multiclass VOP1Inst : VOP1_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI ; } @@ -1672,6 +2136,10 @@ multiclass 
VOP2_Helper pat32, defm _e64 : VOP3_2_m ; + + def _dpp : VOP2_DPP ; + + def _sdwa : VOP2_SDWA ; } multiclass VOP2Inst : VOP2_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), revOp >; @@ -1695,14 +2163,41 @@ multiclass VOP2InstSI ; } +multiclass VOP2e_Helper pat32, list pat64, + string revOp, bit useSGPRInput> { + + let SchedRW = [Write32Bit] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { + defm _e32 : VOP2_m ; + } + + defm _e64 : VOP3e_2_3_m ; + } +} + +multiclass VOP2eInst : VOP2e_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, !eq(P.NumSrcArgs, 3) +>; + multiclass VOP2b_Helper pat32, list pat64, string revOp, bit useSGPRInput> { @@ -1722,11 +2217,11 @@ multiclass VOP2bInst : VOP2b_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), revOp, !eq(P.NumSrcArgs, 3) >; @@ -1746,31 +2241,35 @@ multiclass VOP2_VI3_Inst ; -multiclass VOP2MADK pattern = []> { +multiclass VOP2MADK pattern = []> { - def "" : VOP2_Pseudo ; + def "" : VOP2_Pseudo ; let isCodeGenOnly = 0 in { - def _si : VOP2Common , - SIMCInstr , + def _si : VOP2Common , + SIMCInstr , VOP2_MADKe { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } - def _vi : VOP2Common , - SIMCInstr , + def _vi : VOP2Common , + SIMCInstr , VOP2_MADKe { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } // End isCodeGenOnly = 0 } @@ -1778,37 +2277,55 @@ let isCodeGenOnly = 0 in { class VOPC_Pseudo pattern, string opName> : VOPCCommon , VOP , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } +class VOPC_SDWA : + VOPC_SDWAe , + VOP_SDWA , + SDWADisableFields
<p>
{ + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let AsmMatchConverter = "cvtSdwaVOPC"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + multiclass VOPC_m pattern, string opName, bit DefExec, VOPProfile p, list sched, string revOpName = "", string asm = opName#"_e32 "#op_asm, string alias_asm = opName#" "#op_asm> { - def "" : VOPC_Pseudo { + def "" : VOPC_Pseudo , + VOP2_REV { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = sched; + let isConvergent = DefExec; } let AssemblerPredicates = [isSICI] in { def _si : VOPC, - SIMCInstr { + SIMCInstr { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; + let isConvergent = DefExec; let SchedRW = sched; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } } // End AssemblerPredicates = [isSICI] let AssemblerPredicates = [isVI] in { def _vi : VOPC, - SIMCInstr { + SIMCInstr { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; + let isConvergent = DefExec; let SchedRW = sched; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } // End AssemblerPredicates = [isVI] @@ -1819,10 +2336,13 @@ multiclass VOPC_m pattern, multiclass VOPC_Helper pat32, list pat64, bit DefExec, string revOp, VOPProfile p, list sched> { - defm _e32 : VOPC_m ; + defm _e32 : VOPC_m ; - defm _e64 : VOP3_C_m ; + + def _sdwa : VOPC_SDWA ; } // Special case for class instructions which only have modifiers on @@ -1832,9 +2352,14 @@ multiclass VOPC_Class_Helper pat32, VOPProfile p, list sched> { defm _e32 : VOPC_m ; - defm _e64 : VOP3_C_m , VOP3DisableModFields<1, 0, 0>; + + def _sdwa : VOPC_SDWA { + let src1_fmodifiers = 0; + let src1_imodifiers = ?; + } } multiclass VOPCInst ; @@ -1859,9 +2384,9 @@ multiclass VOPCClassInst sched> : VOPC_Class_Helper < op, opName, [], !if(P.HasModifiers, - [(set i1:$dst, + [(set i1:$sdst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), DefExec, opName, P, sched >; @@ -1897,10 +2422,6 @@ multiclass VOPCX_I32 : multiclass VOPCX_I64 : VOPCX ; -multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods ->; multiclass VOPC_CLASS_F32 : VOPCClassInst ; @@ -1914,32 +2435,40 @@ multiclass VOPC_CLASS_F64 : multiclass VOPCX_CLASS_F64 : VOPCClassInst ; + +multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods, + bit VOP3Only = 0> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only +>; + multiclass VOP3Inst : VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + SDPatternOperator node = null_frag, bit VOP3Only = 0> : + VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64, !if(!eq(P.NumSrcArgs, 3), !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]), !if(!eq(P.NumSrcArgs, 2), !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set 
P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) /* P.NumSrcArgs == 1 */, !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers, VOP3Only >; // Special case for v_div_fmas_{f32|f64}, since it seems to be the @@ -1948,14 +2477,14 @@ multiclass VOP3_VCC_Inst : VOP3_Helper < op, opName, - (outs P.DstRC.RegClass:$dst), - (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, - InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, - InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, - ClampMod:$clamp, + (outs P.DstRC.RegClass:$vdst), + (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0, + FPInputMods:$src1_modifiers, P.Src1RC64:$src1, + FPInputMods:$src2_modifiers, P.Src2RC64:$src2, + clampmod:$clamp, omod:$omod), - "$dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$dst, + "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1964,11 +2493,11 @@ multiclass VOP3_VCC_Inst ; -multiclass VOP3bInst pattern = []> : +multiclass VOP3bInst pattern = [], bit VOP3Only = 0> : VOP3b_2_3_m < op, P.Outs64, P.Ins64, opName#" "#P.Asm64, pattern, - opName, "", 1, 1 + opName, "", 1, 1, VOP3Only >; class Vop3ModPat : Pat< @@ -1987,7 +2516,7 @@ class Vop3ModPat : Pat< class VINTRP_Pseudo pattern> : VINTRPCommon , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -1996,13 +2525,21 @@ class VINTRP_Real_si op, string opName, dag outs, dag ins, string asm> : VINTRPCommon , VINTRPe , - SIMCInstr; + SIMCInstr { + let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} class VINTRP_Real_vi op, string opName, dag outs, dag ins, string asm> : VINTRPCommon , VINTRPe_vi , - SIMCInstr; + SIMCInstr { + let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} multiclass VINTRP_m op, dag outs, dag ins, string asm, list pattern = []> { @@ -2019,7 +2556,7 @@ multiclass VINTRP_m op, dag outs, dag ins, string asm, class DS_Pseudo pattern> : DS , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -2027,14 +2564,22 @@ class DS_Pseudo pattern> : class DS_Real_si op, string opName, dag outs, dag ins, string asm> : DS , DSe , - SIMCInstr { + SIMCInstr { let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; } class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : DS , DSe_vi , - SIMCInstr ; + SIMCInstr { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; +} class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : DS_Real_si { @@ -2043,7 +2588,6 @@ class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm bits<16> offset; 
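// Note: offset0/offset1 below are carved out of the 16-bit offset with
// TableGen bit slices; offset{7-0} is the low byte, offset{15-8} the high.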
let offset0 = offset{7-0}; let offset1 = offset{15-8}; - let isCodeGenOnly = 0; } class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : @@ -2055,9 +2599,24 @@ class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm let offset1 = offset{15-8}; } +multiclass DS_1A_RET_ { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +// TODO: DS_1A_RET can be inherited from DS_1A_RET_ but its not working +// for some reason. In fact we can remove this class if use dsop everywhere multiclass DS_1A_RET op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr"#"$offset$gds"> { def "" : DS_Pseudo ; @@ -2070,8 +2629,8 @@ multiclass DS_1A_RET op, string opName, RegisterClass rc, multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - gds01:$gds), + dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, + gds:$gds), string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { def "" : DS_Pseudo ; @@ -2084,7 +2643,7 @@ multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), string asm = opName#" $addr, $data0"#"$offset$gds"> { def "" : DS_Pseudo , @@ -2096,11 +2655,25 @@ multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, } } -multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, +multiclass DS_1A_Off8_NORET op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, + offset0:$offset0, offset1:$offset1, gds:$gds), + string asm = opName#" $addr $offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } +} + +multiclass DS_1A2D_Off8_NORET op, string opName, RegisterClass rc, dag outs = (outs), dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + offset0:$offset0, offset1:$offset1, gds:$gds), + string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> { def "" : DS_Pseudo ; @@ -2113,7 +2686,7 @@ multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, multiclass DS_1A1D_RET op, string opName, RegisterClass rc, string noRetOp = "", dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { let hasPostISelHook = 1 in { @@ -2127,6 +2700,23 @@ multiclass DS_1A1D_RET op, string opName, RegisterClass rc, } } +multiclass DS_1A1D_PERMUTE op, string opName, RegisterClass rc, + SDPatternOperator node = null_frag, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset), + string asm = opName#" $vdst, $addr, $data0"#"$offset"> { + + let mayLoad = 0, mayStore = 0, isConvergent = 1 in { + def "" : DS_Pseudo ; + + let data1 = 0, gds = 0 in { + def "_vi" : DS_Off16_Real_vi ; + } + } +} + 
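Most DS_* multiclasses here lean on TableGen default template arguments: outs, ins, and asm default to values derived from opName, and individual instructions override only what differs. A hypothetical standalone reduction of that idiom:

class AsmRec<string asm> { string AsmString = asm; }
multiclass DSLike<string opName,
                  string asm = opName#" $vdst, $addr$offset$gds"> {
  def ""   : AsmRec<asm>;                               // default form
  def _gds : AsmRec<opName#" $vdst, $addr$offset gds">; // fixed-gds variant
}
defm DS_EXAMPLE : DSLike<"ds_example">; // yields DS_EXAMPLE, DS_EXAMPLE_gds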
multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, string noRetOp = "", dag ins, dag outs = (outs rc:$vdst), @@ -2145,14 +2735,14 @@ multiclass DS_1A2D_RET op, string asm, RegisterClass rc, string noRetOp = "", RegisterClass src = rc> : DS_1A2D_RET_m ; multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, string noRetOp = opName, dag outs = (outs), dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset:$offset, gds:$gds), + offset:$offset, gds:$gds), string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { def "" : DS_Pseudo , @@ -2166,7 +2756,7 @@ multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, multiclass DS_0A_RET op, string opName, dag outs = (outs VGPR_32:$vdst), - dag ins = (ins ds_offset:$offset, gds:$gds), + dag ins = (ins offset:$offset, gds:$gds), string asm = opName#" $vdst"#"$offset"#"$gds"> { let mayLoad = 1, mayStore = 1 in { @@ -2181,7 +2771,7 @@ multiclass DS_0A_RET op, string opName, multiclass DS_1A_RET_GDS op, string opName, dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + dag ins = (ins VGPR_32:$addr, offset:$offset), string asm = opName#" $vdst, $addr"#"$offset gds"> { def "" : DS_Pseudo ; @@ -2207,7 +2797,7 @@ multiclass DS_1A_GDS op, string opName, multiclass DS_1A op, string opName, dag outs = (outs), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), string asm = opName#" $addr"#"$offset"#"$gds"> { let mayLoad = 1, mayStore = 1 in { @@ -2226,7 +2816,7 @@ multiclass DS_1A op, string opName, class MTBUF_Pseudo pattern> : MTBUF , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -2235,12 +2825,18 @@ class MTBUF_Real_si op, string opName, dag outs, dag ins, string asm> : MTBUF , MTBUFe , - SIMCInstr; + SIMCInstr { + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; +} class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : MTBUF , MTBUFe_vi , - SIMCInstr ; + SIMCInstr { + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; +} multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, list pattern> { @@ -2311,7 +2907,7 @@ class MUBUFAddr64Table { class MUBUF_Pseudo pattern> : MUBUF , - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; @@ -2329,16 +2925,22 @@ class MUBUF_Real_si : MUBUF , MUBUFe , - SIMCInstr { + SIMCInstr { let lds = 0; + let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; } class MUBUF_Real_vi : MUBUF , MUBUFe_vi , - SIMCInstr { + SIMCInstr { let lds = 0; + let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; } multiclass MUBUF_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + AtomicNoRet; + + let tfe = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; + } +} + multiclass MUBUF_Atomic { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { // No return variants - let glc = 0 in { + let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + SCSrc_32:$soffset, offset:$offset, slc:$slc), + name#" $vdata, 
$vaddr, $srsrc, $soffset addr64$offset$slc", [], 0 >; defm _OFFSET : MUBUFAtomicOffset_m < op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0 >; + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUFAtomicOther_m < + op, name#"_offen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0 + >; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUFAtomicOther_m < + op, name#"_idxen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0 + >; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUFAtomicOther_m < + op, name#"_bothen", (outs), + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc", + [], 0 + >; + } } // glc = 0 // Variant that return values let glc = 1, Constraints = "$vdata = $vdata_in", + AsmMatchConverter = "cvtMubufAtomicReturn", DisableEncoding = "$vdata_in" in { defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", + SCSrc_32:$soffset, offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc", [(set vt:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2439,13 +3085,42 @@ multiclass MUBUF_Atomic ; + let offen = 1, idxen = 0 in { + defm _RTN_OFFEN : MUBUFAtomicOther_m < + op, name#"_rtn_offen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc", + [], 1 + >; + } + + let offen = 0, idxen = 1 in { + defm _RTN_IDXEN : MUBUFAtomicOther_m < + op, name#"_rtn_idxen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc", + [], 1 + >; + } + + let offen = 1, idxen = 1 in { + defm _RTN_BOTHEN : MUBUFAtomicOther_m < + op, name#"_rtn_bothen", (outs rc:$vdata), + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc", + [], 1 + >; + } } // glc = 1 } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 @@ -2461,8 +3136,8 @@ multiclass MUBUF_Load_Helper ; @@ -2471,33 +3146,32 @@ multiclass MUBUF_Load_Helper ; + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 1 in { defm _IDXEN : MUBUF_m ; + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; } let offen = 1, idxen = 1 in { defm _BOTHEN : MUBUF_m ; + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m { let mayLoad = 0, mayStore = 1 in { - defm : MUBUF_m ; - let 
offen = 0, idxen = 0, vaddr = 0 in { defm _OFFSET : MUBUF_m ; } // offen = 0, idxen = 0, vaddr = 0 @@ -2528,35 +3195,35 @@ multiclass MUBUF_Store_Helper ; + name#" $vdata, $vaddr, $srsrc, $soffset offen"# + "$offset$glc$slc$tfe", []>; } // end offen = 1, idxen = 0 let offen = 0, idxen = 1 in { defm _IDXEN : MUBUF_m ; + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; } let offen = 1, idxen = 1 in { defm _BOTHEN : MUBUF_m ; + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m ci, bits<7> vi = ci> { class FLAT_Pseudo pattern> : FLAT <0, outs, ins, "", pattern>, - SIMCInstr { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; } class FLAT_Real_ci op, string opName, dag outs, dag ins, string asm> : FLAT , - SIMCInstr { + SIMCInstr { let AssemblerPredicate = isCIOnly; + let DecoderNamespace="CI"; } class FLAT_Real_vi op, string opName, dag outs, dag ins, string asm> : FLAT , - SIMCInstr { + SIMCInstr { let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; } multiclass FLAT_AtomicRet_m { + dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe), + string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> { let data = 0, mayLoad = 1 in { @@ -2639,9 +3309,9 @@ multiclass FLAT_Load_Helper { + dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc, + slc:$slc, tfe:$tfe), + string asm = asm_name#" $addr, $data$glc$slc$tfe"> { let mayLoad = 0, mayStore = 1, vdst = 0 in { @@ -2654,32 +3324,36 @@ multiclass FLAT_Store_Helper { let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { - def "" : FLAT_Pseudo , + slc:$slc, tfe:$tfe), []>, AtomicNoRet ; - def _ci : FLAT_Real_ci ; - def _vi : FLAT_Real_vi ; } let glc = 1, hasPostISelHook = 1 in { - defm _RTN : FLAT_AtomicRet_m ; + defm _RTN : FLAT_AtomicRet_m < + op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe), + asm_name#" $vdst, $addr, $data glc$slc$tfe", + [(set vt:$vdst, + (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))] + >; } } @@ -2688,27 +3362,39 @@ class MIMG_Mask { int Channels = channels; } +class mimg si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class MIMG_Helper : MIMG { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let DecoderNamespace = dns; + let isAsmParserOnly = !if(!eq(dns,""), 1, 0); + let AsmMatchConverter = "cvtMIMG"; +} + class MIMG_NoSampler_Helper op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < - op, + RegisterClass addr_rc, + string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc", - []> { + (ins addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe { let ssamp = 0; - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; } multiclass MIMG_NoSampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper , + def _V1 : MIMG_NoSampler_Helper , MIMG_Mask; def _V2 : MIMG_NoSampler_Helper , MIMG_Mask; @@ -2723,27 +3409,116 @@ multiclass MIMG_NoSampler op, 
string asm> { defm _V4 : MIMG_NoSampler_Src_Helper ; } +class MIMG_Store_Helper op, string asm, + RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + (outs), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + >, MIMGe { + let ssamp = 0; + let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; +} + +multiclass MIMG_Store_Addr_Helper op, string asm, + RegisterClass data_rc, + int channels> { + def _V1 : MIMG_Store_Helper , + MIMG_Mask; + def _V2 : MIMG_Store_Helper , + MIMG_Mask; + def _V4 : MIMG_Store_Helper , + MIMG_Mask; +} + +multiclass MIMG_Store op, string asm> { + defm _V1 : MIMG_Store_Addr_Helper ; + defm _V2 : MIMG_Store_Addr_Helper ; + defm _V3 : MIMG_Store_Addr_Helper ; + defm _V4 : MIMG_Store_Addr_Helper ; +} + +class MIMG_Atomic_Helper : MIMG_Helper < + (outs data_rc:$vdst), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + > { + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; +} + +class MIMG_Atomic_Real_si : + MIMG_Atomic_Helper, + SIMCInstr, + MIMGe { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class MIMG_Atomic_Real_vi : + MIMG_Atomic_Helper, + SIMCInstr, + MIMGe { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +multiclass MIMG_Atomic_Helper_m { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : MIMG_Atomic_Helper, + SIMCInstr; + } + + let ssamp = 0 in { + def _si : MIMG_Atomic_Real_si; + + def _vi : MIMG_Atomic_Real_vi; + } +} + +multiclass MIMG_Atomic { + defm _V1 : MIMG_Atomic_Helper_m ; + defm _V2 : MIMG_Atomic_Helper_m ; + defm _V4 : MIMG_Atomic_Helper_m ; +} + class MIMG_Sampler_Helper op, string asm, RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, + RegisterClass src_rc, + int wqm, + string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe { let WQM = wqm; } multiclass MIMG_Sampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper , + def _V1 : MIMG_Sampler_Helper , MIMG_Mask; def _V2 : MIMG_Sampler_Helper , MIMG_Mask; @@ -2755,31 +3530,24 @@ multiclass MIMG_Sampler_Src_Helper op, string asm, MIMG_Mask; } -multiclass MIMG_Sampler op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; 
+multiclass MIMG_Sampler op, string asm, int wqm=0> { + defm _V1 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; } -multiclass MIMG_Sampler_WQM op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} +multiclass MIMG_Sampler_WQM op, string asm> : MIMG_Sampler; class MIMG_Gather_Helper op, string asm, RegisterClass dst_rc, RegisterClass src_rc, int wqm> : MIMG < - op, (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + []>, MIMGe { let mayLoad = 1; let mayStore = 0; @@ -2789,10 +3557,12 @@ class MIMG_Gather_Helper op, string asm, // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns // (red,red,red,red) etc.) The ISA document doesn't mention // this. - // Therefore, disable all code which updates DMASK by setting these two: - let MIMG = 0; + // Therefore, disable all code which updates DMASK by setting this: + let Gather4 = 1; let hasPostISelHook = 0; let WQM = wqm; + + let isAsmParserOnly = 1; // TBD: fix it later } multiclass MIMG_Gather_Src_Helper op, string asm, @@ -2810,19 +3580,14 @@ multiclass MIMG_Gather_Src_Helper op, string asm, MIMG_Mask; } -multiclass MIMG_Gather op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; +multiclass MIMG_Gather op, string asm, int wqm=0> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; } -multiclass MIMG_Gather_WQM op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; -} +multiclass MIMG_Gather_WQM op, string asm> : MIMG_Gather; //===----------------------------------------------------------------------===// // Vector instruction mappings @@ -2894,8 +3659,9 @@ def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; - let KeyCol = [!cast(SISubtarget.NONE)]; - let ValueCols = [[!cast(SISubtarget.SI)],[!cast(SISubtarget.VI)]]; + let KeyCol = [!cast(SIEncodingFamily.NONE)]; + let ValueCols = [[!cast(SIEncodingFamily.SI)], + [!cast(SIEncodingFamily.VI)]]; } def getAddr64Inst : InstrMapping { diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 89692ab71f4d..6427db87cd6f 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -18,35 +18,17 @@ int P20 = 1; } def INTERP : InterpSlots; -def InterpSlot : Operand { - let PrintMethod = "printInterpSlot"; -} - -def SendMsgImm : Operand { - let PrintMethod = "printSendMsg"; -} - def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + ">= SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : 
Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + "== SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def WAIT_FLAG : InstFlag<"printWaitFlag"> { - let ParserMatchClass = SWaitMatchClass; -} - let SubtargetPredicate = isGCN in { //===----------------------------------------------------------------------===// @@ -59,17 +41,17 @@ defm EXP : EXP_m; // SMRD Instructions //===----------------------------------------------------------------------===// -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 +// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SReg_32_XM0 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper , "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORD : SMRD_Helper , "s_load_dword", SReg_64, SReg_32_XM0>; defm S_LOAD_DWORDX2 : SMRD_Helper , "s_load_dwordx2", SReg_64, SReg_64>; defm S_LOAD_DWORDX4 : SMRD_Helper , "s_load_dwordx4", SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SMRD_Helper , "s_load_dwordx8", SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SMRD_Helper , "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < @@ -88,7 +70,15 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; -//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; +let mayStore = ? in { +// FIXME: mayStore = ? is a workaround for tablegen bug for different +// inferred mayStore flags for the instruction pattern vs. standalone +// Pat. Each considers the other contradictory. 
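The FIXME above is about the two places a selection pattern can live: inline in the instruction definition, as the S_MEMTIME defm below does, or as a standalone Pat. A sketch of the standalone form, assuming the usual llvm/Target .td includes; TableGen infers mayLoad/mayStore for each form independently, which is the contradiction mayStore = ? papers over:

// Standalone pattern mapping the intrinsic to the instruction. Flag
// inference for this Pat can disagree with the flags inferred from the
// pattern inside the S_MEMTIME definition itself.
def : Pat <
  (i64 (int_amdgcn_s_memtime)),
  (S_MEMTIME)
>;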
+ +defm S_MEMTIME : SMRD_Special , "s_memtime", + (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))] +>; +} defm S_DCACHE_INV : SMRD_Inval , "s_dcache_inv", int_amdgcn_s_dcache_inv>; @@ -101,7 +91,7 @@ let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; - } // let isRematerializeable = 1 + } // End isRematerializeable = 1 let Uses = [SCC] in { defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; @@ -111,11 +101,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { defm S_NOT_B32 : SOP1_32 , "s_not_b32", - [(set i32:$dst, (not i32:$src0))] + [(set i32:$sdst, (not i32:$src0))] >; defm S_NOT_B64 : SOP1_64 , "s_not_b64", - [(set i64:$dst, (not i64:$src0))] + [(set i64:$sdst, (not i64:$src0))] >; defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; @@ -123,7 +113,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 , "s_brev_b32", - [(set i32:$dst, (bitreverse i32:$src0))] + [(set i32:$sdst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; @@ -131,7 +121,7 @@ let Defs = [SCC] in { defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] + [(set i32:$sdst, (ctpop i32:$src0))] >; defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; } // End Defs = [SCC] @@ -139,34 +129,34 @@ let Defs = [SCC] in { defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", - [(set i32:$dst, (cttz_zero_undef i32:$src0))] + [(set i32:$sdst, (cttz_zero_undef i32:$src0))] >; defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", - [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", - [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] + [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))] >; defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", - [(set i32:$dst, (sext_inreg i32:$src0, i8))] + [(set i32:$sdst, (sext_inreg i32:$src0, i8))] >; defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", - [(set i32:$dst, (sext_inreg i32:$src0, i16))] + [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; +defm S_BITSET0_B64 : SOP1_64_32 , "s_bitset0_b64", []>; defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; +defm S_BITSET1_B64 : SOP1_64_32 , "s_bitset1_b64", []>; defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; +defm S_SETPC_B64 : SOP1_1 , "s_setpc_b64", []>; defm S_SWAPPC_B64 : SOP1_64 , "s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; +defm S_RFE_B64 : SOP1_1 , "s_rfe_b64", []>; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { @@ -206,36 +196,36 @@ let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; defm S_ADD_I32 : SOP2_32 , "s_add_i32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] + [(set i32:$sdst, (add SSrc_32:$src0, 
SSrc_32:$src1))] >; } // End isCommutable = 1 defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; defm S_SUB_I32 : SOP2_32 , "s_sub_i32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] + [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; + [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; + [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] defm S_MIN_I32 : SOP2_32 , "s_min_i32", - [(set i32:$dst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (smin i32:$src0, i32:$src1))] >; defm S_MIN_U32 : SOP2_32 , "s_min_u32", - [(set i32:$dst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (umin i32:$src0, i32:$src1))] >; defm S_MAX_I32 : SOP2_32 , "s_max_i32", - [(set i32:$dst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (smax i32:$src0, i32:$src1))] >; defm S_MAX_U32 : SOP2_32 , "s_max_u32", - [(set i32:$dst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (umax i32:$src0, i32:$src1))] >; } // End Defs = [SCC] @@ -247,27 +237,27 @@ let Uses = [SCC] in { let Defs = [SCC] in { defm S_AND_B32 : SOP2_32 , "s_and_b32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] + [(set i32:$sdst, (and i32:$src0, i32:$src1))] >; defm S_AND_B64 : SOP2_64 , "s_and_b64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] + [(set i64:$sdst, (and i64:$src0, i64:$src1))] >; defm S_OR_B32 : SOP2_32 , "s_or_b32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] + [(set i32:$sdst, (or i32:$src0, i32:$src1))] >; defm S_OR_B64 : SOP2_64 , "s_or_b64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] + [(set i64:$sdst, (or i64:$src0, i64:$src1))] >; defm S_XOR_B32 : SOP2_32 , "s_xor_b32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] + [(set i32:$sdst, (xor i32:$src0, i32:$src1))] >; defm S_XOR_B64 : SOP2_64 , "s_xor_b64", - [(set i64:$dst, (xor i64:$src0, i64:$src1))] + [(set i64:$sdst, (xor i64:$src0, i64:$src1))] >; defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; @@ -286,30 +276,30 @@ let AddedComplexity = 1 in { let Defs = [SCC] in { defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] + [(set i32:$sdst, (shl i32:$src0, i32:$src1))] >; defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] + [(set i64:$sdst, (shl i64:$src0, i32:$src1))] >; defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] + [(set i32:$sdst, (srl i32:$src0, i32:$src1))] >; defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] + [(set i64:$sdst, (srl i64:$src0, i32:$src1))] >; defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] + [(set i32:$sdst, (sra i32:$src0, i32:$src1))] >; defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] + [(set i64:$sdst, (sra i64:$src0, i32:$src1))] >; } // End Defs = [SCC] defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; + [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64_32_32 , "s_bfm_b64", []>; defm S_MUL_I32 : SOP2_32 , "s_mul_i32", - [(set i32:$dst, 
(mul i32:$src0, i32:$src1))] + [(set i32:$sdst, (mul i32:$src0, i32:$src1))] >; } // End AddedComplexity = 1 @@ -317,7 +307,7 @@ defm S_MUL_I32 : SOP2_32 , "s_mul_i32", let Defs = [SCC] in { defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; +defm S_BFE_U64 : SOP2_64_32 , "s_bfe_u64", []>; defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; } // End Defs = [SCC] @@ -336,23 +326,23 @@ defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; // SOPC Instructions //===----------------------------------------------------------------------===// -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; +def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>; +def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>; +def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>; +def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>; +def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>; +def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>; +def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>; +def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE >; +def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>; +def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>; +def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>; +def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>; +def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">; +def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">; +def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">; +def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">; +def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">; //===----------------------------------------------------------------------===// // SOPK Instructions @@ -408,16 +398,23 @@ defm S_CBRANCH_I_FORK : SOPK_m < sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" >; -defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; + +let mayLoad = 1 in { +defm S_GETREG_B32 : SOPK_m < + sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst), + (ins hwreg:$simm16), " $sdst, $simm16" +>; +} + defm S_SETREG_B32 : SOPK_m < sopk<0x13, 0x12>, "s_setreg_b32", (outs), - (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" + (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst" >; // FIXME: Not on SI? 
//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; defm S_SETREG_IMM32_B32 : SOPK_IMM32 < sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), - (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" + (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm" >; //===----------------------------------------------------------------------===// @@ -429,10 +426,11 @@ def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; let isTerminator = 1 in { def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(IL_retflag)]> { + [(AMDGPUendpgm)]> { let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; + let hasSideEffects = 1; } let isBranch = 1 in { @@ -449,7 +447,8 @@ def S_CBRANCH_SCC0 : SOPP < >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins sopp_brtarget:$simm16), - "s_cbranch_scc1 $simm16" + "s_cbranch_scc1 $simm16", + [(si_uniform_br_scc SCC, bb:$simm16)] >; } // End Uses = [SCC] @@ -481,7 +480,7 @@ def S_CBRANCH_EXECNZ : SOPP < let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", - [(int_AMDGPU_barrier_local)] + [(int_amdgcn_s_barrier)] > { let SchedRW = [WriteBarrier]; let simm16 = 0; @@ -490,18 +489,31 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", let isConvergent = 1; } +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; -def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; -def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; + +// On SI the documentation says sleep for approximately 64 * low 2 +// bits, consistent with the reported maximum of 448. On VI the +// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the +// maximum really 15 on VI? +def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), + "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; let Uses = [EXEC, M0] in { + // FIXME: Should this be mayLoad+mayStore? 
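+ // (Illustrative use, assuming the two-operand llvm.amdgcn.s.sendmsg form:
+ //   call void @llvm.amdgcn.s.sendmsg(i32 2, i32 %stream) ; MSG_GS
+ // the message selector travels in simm16 while the payload is pre-copied
+ // into m0, which is why this block is under Uses = [EXEC, M0].)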
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", [(AMDGPUsendmsg (i32 imm:$simm16))] >; } // End Uses = [EXEC, M0] -def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">; def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; @@ -770,8 +782,8 @@ defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; let mayLoad = 0 in { defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; -defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; } defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; @@ -811,7 +823,11 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; + +let Uses = [EXEC], mayLoad =0, mayStore = 0, isConvergent = 1 in { +defm DS_SWIZZLE_B32 : DS_1A_RET_ , "ds_swizzle_b32", VGPR_32>; +} + let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; @@ -839,8 +855,8 @@ defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; let mayLoad = 0 in { defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; } defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; @@ -886,7 +902,7 @@ defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; -defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">; defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; @@ -903,7 +919,7 @@ defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; -defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; @@ -937,16 +953,16 
@@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 >; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load @@ -981,7 +997,9 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global >; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic < + mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag +>; defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global >; @@ -1010,30 +1028,61 @@ defm BUFFER_ATOMIC_OR : MUBUF_Atomic < defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global >; -//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; +defm BUFFER_ATOMIC_INC : MUBUF_Atomic < + mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global +>; +defm BUFFER_ATOMIC_DEC : MUBUF_Atomic < + mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global +>; + +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic , "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic , "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic , 
"buffer_atomic_fmax", []>; // isn't on VI +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic < + mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global +>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic < + mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag +>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic < + mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic < + mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global +>; +//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic < + mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic < + mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic < + mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic < + mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic < + mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global +>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic < + mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic < + mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global +>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic < + mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global +>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic < + mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global +>; //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI -let SubtargetPredicate = isSI in { +let SubtargetPredicate = isSI, DisableVIDecoder = 1 in { defm BUFFER_WBINVL1_SC : MUBUF_Invalidate , "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI } @@ -1062,28 +1111,28 @@ defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; //def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 
0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic , "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic , "image_atomic_cmpswap", VReg_64>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic , "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic , "image_atomic_sub">; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic , "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic , "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic , "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic , "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic , "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic , "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic , "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic , "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic , "image_atomic_dec">; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; @@ -1171,10 +1220,12 @@ let Uses = [EXEC] in { def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), - (ins VGPR_32:$src0), + (ins VS_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [] ->; +> { + let isConvergent = 1; +} } @@ -1234,7 +1285,7 @@ defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; -} // let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", VOP_F32_F32, AMDGPUfract @@ -1270,7 +1321,7 @@ defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; -} //let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1281,7 +1332,7 @@ defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", VOP_F64_F64, AMDGPUrsq >; -} // let SchedRW = [WriteDouble]; +} // End SchedRW = [WriteDouble]; defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", VOP_F32_F32, fsqrt @@ -1312,34 +1363,34 @@ defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", - VOP_I32_F64 + VOP_I32_F64, int_amdgcn_frexp_exp >; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst , 
"v_frexp_mant_f64", - VOP_F64_F64 + VOP_F64_F64, int_amdgcn_frexp_mant >; defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", - VOP_F64_F64 + VOP_F64_F64, AMDGPUfract >; } // End SchedRW = [WriteDoubleAdd] defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", - VOP_I32_F32 + VOP_I32_F32, int_amdgcn_frexp_exp >; defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", - VOP_F32_F32 + VOP_F32_F32, int_amdgcn_frexp_mant >; let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { -defm V_CLREXCP : VOP1Inst , "v_clrexcp", VOP_NONE>; +defm V_CLREXCP : VOP1Inst , "v_clrexcp", VOP_NO_EXT>; } let Uses = [M0, EXEC] in { -defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>; -defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>; -defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; +defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_NO_EXT>; +defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_NO_EXT>; +defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_NO_EXT>; } // End Uses = [M0, EXEC] // These instruction only exist on SI and CI @@ -1348,11 +1399,12 @@ let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; -defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", + VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", - VOP_F32_F32, AMDGPUrsq_clamped + VOP_F32_F32, AMDGPUrsq_clamp >; defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy @@ -1364,7 +1416,7 @@ let SchedRW = [WriteDouble] in { defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamped + VOP_F64_F64, AMDGPUrsq_clamp >; } // End SchedRW = [WriteDouble] @@ -1394,11 +1446,11 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; } // End OtherPredicates = [has32BankLDS] -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { @@ -1426,15 +1478,9 @@ defm V_INTERP_MOV_F32 : VINTRP_m < // VOP2 Instructions //===----------------------------------------------------------------------===// -multiclass V_CNDMASK { - defm _e32 : VOP2_m ; - - defm _e64 : VOP3_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, - name#!cast(VOP_CNDMASK.Asm64), [], name, 3>; -} - -defm V_CNDMASK_B32 : V_CNDMASK, "v_cndmask_b32">; +defm V_CNDMASK_B32 : VOP2eInst , "v_cndmask_b32", + VOP2e_I32_I32_I32_I1 +>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst , "v_add_f32", @@ -1450,7 +1496,7 @@ defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", let isCommutable = 1 in { defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", - VOP_F32_F32_F32, int_AMDGPU_mul + VOP_F32_F32_F32 >; defm V_MUL_F32 : VOP2Inst , "v_mul_f32", @@ -1501,16 +1547,16 @@ defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; defm V_XOR_B32 : VOP2Inst , "v_xor_b32", 
VOP_I32_I32_I32>; -let Constraints = "$dst = $src2", DisableEncoding="$src2", +let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_MAC>; } } // End isCommutable = 1 -defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; +defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32", VOP_MADMK>; let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; +defm V_MADAK_F32 : VOP2MADK , "v_madak_f32", VOP_MADAK>; } // End isCommutable = 1 let isCommutable = 1 in { @@ -1540,11 +1586,14 @@ defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", } // End isCommutable = 1 +// These are special and do not read the exec mask. +let isConvergent = 1, Uses = [] in { + defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, "v_readlane_b32", (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SCSrc_32:$src1), + (ins VS_32:$src0, SCSrc_32:$src1), "v_readlane_b32 $vdst, $src0, $src1" >; @@ -1556,6 +1605,8 @@ defm V_WRITELANE_B32 : VOP2SI_3VI_m < "v_writelane_b32 $vdst, $src0, $src1" >; +} // End isConvergent = 1 + // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1636,16 +1687,16 @@ defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", } // End isCommutable = 1 defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubeid >; defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubesc >; defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubetc >; defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubema >; defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", @@ -1666,6 +1717,10 @@ defm V_FMA_F32 : VOP3Inst , "v_fma_f32", defm V_FMA_F64 : VOP3Inst , "v_fma_f64", VOP_F64_F64_F64_F64, fma >; + +defm V_LERP_U8 : VOP3Inst , "v_lerp_u8", + VOP_I32_I32_I32_I32, int_amdgcn_lerp +>; } // End isCommutable = 1 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; @@ -1695,13 +1750,13 @@ defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; defm V_MED3_F32 : VOP3Inst , "v_med3_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, AMDGPUfmed3 >; defm V_MED3_I32 : VOP3Inst , "v_med3_i32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUsmed3 >; defm V_MED3_U32 : VOP3Inst , "v_med3_u32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUumed3 >; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; @@ -1710,7 +1765,7 @@ defm V_MED3_U32 : VOP3Inst , "v_med3_u32", defm V_SAD_U32 : VOP3Inst , "v_sad_u32", VOP_I32_I32_I32_I32 >; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; @@ -1727,26 +1782,26 @@ let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst , "v_add_f64", - VOP_F64_F64_F64, fadd + VOP_F64_F64_F64, fadd, 1 >; defm V_MUL_F64 : VOP3Inst , "v_mul_f64", - VOP_F64_F64_F64, fmul + VOP_F64_F64_F64, fmul, 1 >; defm V_MIN_F64 : VOP3Inst , "v_min_f64", - VOP_F64_F64_F64, fminnum + VOP_F64_F64_F64, fminnum, 1 >; defm V_MAX_F64 : VOP3Inst , "v_max_f64", - VOP_F64_F64_F64, fmaxnum + VOP_F64_F64_F64, fmaxnum, 1 >; -} // isCommutable = 1 +} // End isCommutable = 1 defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp + VOP_F64_F64_I32, AMDGPUldexp, 1 >; -} 
// let SchedRW = [WriteDoubleAdd] +} // End let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1754,30 +1809,33 @@ defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", VOP_I32_I32_I32 >; defm V_MUL_HI_U32 : VOP3Inst , "v_mul_hi_u32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, mulhu >; +let DisableVIDecoder=1 in { // removed from VI as identical to V_MUL_LO_U32 defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", VOP_I32_I32_I32 >; +} + defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, mulhs >; -} // isCommutable = 1, SchedRW = [WriteQuarterRate32] +} // End isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { defm V_DIV_SCALE_F32 : VOP3bInst , "v_div_scale_f32", - VOP3b_F32_I1_F32_F32_F32 + VOP3b_F32_I1_F32_F32_F32, [], 1 >; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3bInst , "v_div_scale_f64", - VOP3b_F64_I1_F64_F64_F64 + VOP3b_F64_I1_F64_F64_F64, [], 1 >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] let isCommutable = 1, Uses = [VCC, EXEC] in { @@ -1814,7 +1872,7 @@ defm V_TRIG_PREOP_F64 : VOP3Inst < vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1828,7 +1886,7 @@ defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", } // End SubtargetPredicate = isSICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isVI, DisableSIDecoder = 1 in { defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", VOP_I64_I32_I64 @@ -1845,113 +1903,145 @@ defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1 in { + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns -def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] ->; +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} -let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; -} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 - -let hasSideEffects = 1, SALU = 1 in { -def SGPR_USE : InstSI <(outs),(ins), "", []>; +def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> { + let VALU = 1; } +} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] + +let usesCustomInserter = 1, SALU = 1 in { +def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), + [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; +} // End let usesCustomInserter = 1, SALU = 1 // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. 
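// (Rough sketch of the flow, hedged: SIAnnotateControlFlow rewrites
// structured branches into the amdgcn if/else/loop/end.cf intrinsics matched
// below, and after selection SILowerControlFlow expands the pseudos into
// exec-mask arithmetic, approximately:
//   s_and_saveexec_b64 s[0:1], vcc    ; SI_IF: limit exec to the taken lanes
//   s_xor_b64 s[0:1], exec, s[0:1]    ; remember the lanes masked off for else
//   s_cbranch_execz <else/join>       ; skip the region if no lane is active
// so divergent control flow becomes predication over a shared exec mask.)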
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { -let Uses = [EXEC], Defs = [EXEC] in { +let hasSideEffects = 1 in { + +// Dummy terminator instruction to use after control flow instructions +// replaced with exec mask operations. +def SI_MASK_BRANCH : PseudoInstSI < + (outs), (ins brtarget:$target, SReg_64:$dst)> { + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let SALU = 1; +} + +let Uses = [EXEC], Defs = [EXEC, SCC] in { let isBranch = 1, isTerminator = 1 in { -def SI_IF: InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, brtarget:$target), - "", - [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] ->; +def SI_IF: PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), + [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> { + let Constraints = ""; +} -def SI_ELSE : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target), - "", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] -> { +def SI_ELSE : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), + [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> { let Constraints = "$src = $dst"; } -def SI_LOOP : InstSI < - (outs), - (ins SReg_64:$saved, brtarget:$target), - "si_loop $saved, $target", - [(int_SI_loop i64:$saved, bb:$target)] +def SI_LOOP : PseudoInstSI < + (outs), (ins SReg_64:$saved, brtarget:$target), + [(int_amdgcn_loop i64:$saved, bb:$target)] >; -} // end isBranch = 1, isTerminator = 1 +} // End isBranch = 1, isTerminator = 1 -def SI_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src), - "si_else $dst, $src", - [(set i64:$dst, (int_SI_break i64:$src))] + +def SI_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src), + [(set i64:$dst, (int_amdgcn_break i64:$src))] >; -def SI_IF_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, SReg_64:$src), - "si_if_break $dst, $vcc, $src", - [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +def SI_IF_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), + [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))] >; -def SI_ELSE_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src0, SReg_64:$src1), - "si_else_break $dst, $src0, $src1", - [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +def SI_ELSE_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), + [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))] >; -def SI_END_CF : InstSI < - (outs), - (ins SReg_64:$saved), - "si_end_cf $saved", - [(int_SI_end_cf i64:$saved)] +def SI_END_CF : PseudoInstSI < + (outs), (ins SReg_64:$saved), + [(int_amdgcn_end_cf i64:$saved)] >; -} // End Uses = [EXEC], Defs = [EXEC] +} // End Uses = [EXEC], Defs = [EXEC, SCC] let Uses = [EXEC], Defs = [EXEC,VCC] in { -def SI_KILL : InstSI < - (outs), - (ins VSrc_32:$src), - "si_kill $src", - [(int_AMDGPU_kill f32:$src)] ->; +def SI_KILL : PseudoInstSI < + (outs), (ins VSrc_32:$src), + [(int_AMDGPU_kill f32:$src)]> { + let isConvergent = 1; + let usesCustomInserter = 1; +} + +def SI_KILL_TERMINATOR : PseudoInstSI < + (outs), (ins VSrc_32:$src)> { + let isTerminator = 1; +} + } // End Uses = [EXEC], Defs = [EXEC,VCC] -} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 +} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1 -let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { +def SI_PS_LIVE : PseudoInstSI < + (outs SReg_64:$dst), (ins), + [(set i1:$dst, (int_amdgcn_ps_live))]> { + let SALU = 1; +} -class SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, 
SReg_64:$temp), - (ins rc:$src, VSrc_32:$idx, i32imm:$off), - "si_indirect_src $dst, $temp, $src, $idx, $off", - [] ->; +// Used as an isel pseudo to directly emit initialization with an +// s_mov_b32 rather than a copy of another initialized +// register. MachineCSE skips copies, and we don't want to have to +// fold operands before it runs. +def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> { + let Defs = [M0]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; + let SALU = 1; + let isReMaterializable = 1; +} -class SI_INDIRECT_DST : InstSI < - (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), - "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", - [] -> { - let Constraints = "$src = $dst"; +def SI_RETURN : PseudoInstSI < + (outs), (ins variable_ops), [(AMDGPUreturn)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let hasSideEffects = 1; + let SALU = 1; + let hasNoSchedulingInfo = 1; +} + +let Uses = [EXEC], Defs = [EXEC, VCC, M0], + UseNamedOperandTable = 1 in { + +class SI_INDIRECT_SRC : PseudoInstSI < + (outs VGPR_32:$vdst, SReg_64:$sdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset)>; + +class SI_INDIRECT_DST : PseudoInstSI < + (outs rc:$vdst, SReg_64:$sdst), + (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { + let Constraints = "$src = $vdst"; } // TODO: We can support indirect SGPR access. @@ -1967,25 +2057,20 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; -} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] +} // End Uses = [EXEC], Defs = [EXEC,VCC,M0] multiclass SI_SPILL_SGPR { - let UseNamedOperandTable = 1, Uses = [EXEC] in { - def _SAVE : InstSI < + def _SAVE : PseudoInstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx), - "", [] - > { + (ins sgpr_class:$src, i32imm:$frame_idx)> { let mayStore = 1; let mayLoad = 0; } - def _RESTORE : InstSI < + def _RESTORE : PseudoInstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx), - "", [] - > { + (ins i32imm:$frame_idx)> { let mayStore = 0; let mayLoad = 1; } @@ -1993,9 +2078,9 @@ multiclass SI_SPILL_SGPR { } // It's unclear whether you can use M0 as the output of v_readlane_b32 -// instructions, so use SGPR_32 register class for spills to prevent +// instructions, so use SReg_32_XM0 register class for spills to prevent // this from happening. 
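// (For context, a sketch of the expansion: SIRegisterInfo lowers the
// _SAVE/_RESTORE pseudos below by bouncing each SGPR through a VGPR lane,
// approximately
//   v_writelane_b32 vN, sM, <lane>   ; save
//   v_readlane_b32  sM, vN, <lane>   ; restore
// and v_readlane_b32 writing M0 is exactly the hazard the comment above is
// guarding against.)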
-defm SI_SPILL_S32 : SI_SPILL_SGPR ; +defm SI_SPILL_S32 : SI_SPILL_SGPR ; defm SI_SPILL_S64 : SI_SPILL_SGPR ; defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; @@ -2003,21 +2088,18 @@ defm SI_SPILL_S512 : SI_SPILL_SGPR ; multiclass SI_SPILL_VGPR { let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { - def _SAVE : InstSI < + def _SAVE : PseudoInstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - > { + SReg_32:$scratch_offset, i32imm:$offset)> { let mayStore = 1; let mayLoad = 0; } - def _RESTORE : InstSI < + def _RESTORE : PseudoInstSI < (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - > { + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset, + i32imm:$offset)> { let mayStore = 0; let mayLoad = 1; } @@ -2033,29 +2115,19 @@ defm SI_SPILL_V512 : SI_SPILL_VGPR ; let Defs = [SCC] in { -def SI_CONSTDATA_PTR : InstSI < +def SI_PC_ADD_REL_OFFSET : PseudoInstSI < (outs SReg_64:$dst), - (ins const_ga:$ptr), - "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] -> { + (ins si_ga:$ptr), + [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> { let SALU = 1; } } // End Defs = [SCC] -} // end IsCodeGenOnly, isPseudo - -} // end SubtargetPredicate = isGCN +} // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { -def : Pat< - (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, - (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, - DSTCLAMP.NONE, DSTOMOD.NONE)) ->; - def : Pat < (int_AMDGPU_kilp), (SI_KILL 0xbf800000) @@ -2067,7 +2139,6 @@ def : Pat< (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; -/* int_SI_export */ def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, f32:$src0, f32:$src1, f32:$src2, f32:$src3), @@ -2075,6 +2146,217 @@ def : Pat < $src0, $src1, $src2, $src3) >; +//===----------------------------------------------------------------------===// +// buffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUF_LoadIntrinsicPat { + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; + +multiclass MUBUF_StoreIntrinsicPat { + def : Pat< + 
(name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast(opcode # _BOTHEN) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; + +//===----------------------------------------------------------------------===// +// buffer_atomic patterns +//===----------------------------------------------------------------------===// +multiclass BufferAtomicPatterns { + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast(opcode # _RTN_BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) + >; +} + +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + 
(int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + + +//===----------------------------------------------------------------------===// +// S_GETREG_B32 Intrinsic Pattern. +//===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_s_getreg imm:$simm16), + (S_GETREG_B32 (as_i16imm $simm16)) +>; + +//===----------------------------------------------------------------------===// +// DS_SWIZZLE Intrinsic Pattern. +//===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), + (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) +>; + //===----------------------------------------------------------------------===// // SMRD Patterns //===----------------------------------------------------------------------===// @@ -2109,7 +2391,6 @@ let AddedComplexity = 100 in { defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; @@ -2143,7 +2424,7 @@ def : Pat < def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, + (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 0), sub1)) >; @@ -2168,8 +2449,8 @@ def : Pat < //===----------------------------------------------------------------------===// def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) + (int_amdgcn_s_waitcnt i32:$simm16), + (S_WAITCNT (as_i16imm $simm16)) >; //===----------------------------------------------------------------------===// @@ -2184,7 +2465,22 @@ let Predicates = [UnsafeFPMath] in { def : RsqPat; def : RsqPat; -} + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [UnsafeFPMath] //===----------------------------------------------------------------------===// // VOP2 Patterns @@ -2217,9 +2513,9 @@ def : Pat < class SampleRawPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc, $sampler) + (opcode $addr, $rsrc, $sampler, + (as_i32imm $dmask), 
(as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass SampleRawPatterns { @@ -2232,11 +2528,11 @@ multiclass SampleRawPatterns { // Image only class ImagePattern : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc) + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, + imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), + (opcode $addr, $rsrc, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass ImagePatterns { @@ -2245,6 +2541,54 @@ multiclass ImagePatterns { def : ImagePattern(opcode # _V4_V4), v4i32>; } +class ImageLoadPattern : Pat < + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, + imm:$slc), + (opcode $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageLoadPatterns { + def : ImageLoadPattern(opcode # _V4_V1), i32>; + def : ImageLoadPattern(opcode # _V4_V2), v2i32>; + def : ImageLoadPattern(opcode # _V4_V4), v4i32>; +} + +class ImageStorePattern : Pat < + (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, + imm:$glc, imm:$slc), + (opcode $data, $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageStorePatterns { + def : ImageStorePattern(opcode # _V4_V1), i32>; + def : ImageStorePattern(opcode # _V4_V2), v2i32>; + def : ImageStorePattern(opcode # _V4_V4), v4i32>; +} + +class ImageAtomicPattern : Pat < + (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), + (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageAtomicPatterns { + def : ImageAtomicPattern(opcode # _V1), i32>; + def : ImageAtomicPattern(opcode # _V2), v2i32>; + def : ImageAtomicPattern(opcode # _V4), v4i32>; +} + +class ImageAtomicCmpSwapPattern : Pat < + (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, + imm:$r128, imm:$da, imm:$slc), + (EXTRACT_SUBREG + (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), + $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), + sub0) +>; + // Basic sample defm : SampleRawPatterns; defm : SampleRawPatterns; @@ -2341,38 +2685,57 @@ def : SampleRawPattern; def : ImagePattern; defm : ImagePatterns; defm : ImagePatterns; +defm : ImageLoadPatterns; +defm : ImageLoadPatterns; +defm : ImageStorePatterns; +defm : ImageStorePatterns; +defm : ImageAtomicPatterns; +def : ImageAtomicCmpSwapPattern; +def : ImageAtomicCmpSwapPattern; +def : ImageAtomicCmpSwapPattern; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; +defm : ImageAtomicPatterns; /* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (SIsample i32:$addr, v8i32:$rsrc, 
v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) >; class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; /* SIsample* for texture lookups consuming more address parameters */ @@ -2422,68 +2785,10 @@ defm : SamplePatterns; -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -multiclass ImageLoadPatterns { - def : ImageLoadPattern ; - def : ImageLoadArrayPattern ; -} - -multiclass ImageLoadMSAAPatterns { - def : ImageLoadMSAAPattern ; - def : ImageLoadArrayMSAAPattern ; -} - -defm : ImageLoadPatterns; -defm : ImageLoadPatterns; - -defm : ImageLoadMSAAPatterns; -defm : ImageLoadMSAAPatterns; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ -//def : Extract_Element; -//def : Extract_Element; -//def : Extract_Element; -//def : Extract_Element; - foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast(sub#Index) @@ -2548,50 +2853,47 @@ foreach Index = 0-15 in { >; } -def : 
BitConvert ; +// FIXME: Why do we only handle some of these type combinations for SReg and +// VReg? +// 32-bit bitcast def : BitConvert ; - -def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +// 64-bit bitcast def : BitConvert ; - def : BitConvert ; - -def : BitConvert ; def : BitConvert ; -def : BitConvert ; +def : BitConvert ; def : BitConvert ; -def : BitConvert ; +def : BitConvert ; def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +def : BitConvert ; def : BitConvert ; +def : BitConvert ; def : BitConvert ; -def : BitConvert ; +def : BitConvert ; def : BitConvert ; +def : BitConvert ; - +// 128-bit bitcast def : BitConvert ; def : BitConvert ; - def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; - - - -def : BitConvert ; +// 256-bit bitcast def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +def : BitConvert ; def : BitConvert ; def : BitConvert ; -def : BitConvert ; +// 512-bit bitcast def : BitConvert ; def : BitConvert ; @@ -2613,7 +2915,7 @@ def : Pat < def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, 0x80000000) /* Set sign bit */ + (S_OR_B32 $src, 0x80000000) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -2703,14 +3005,8 @@ def : Pat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -/* llvm.AMDGPU.pow */ def : POW_Common ; -def : Pat < - (int_AMDGPU_div f32:$src0, f32:$src1), - (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) ->; - def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, @@ -2745,7 +3041,7 @@ class Ext32Pat : Pat < def : Ext32Pat ; def : Ext32Pat ; -// Offset in an 32Bit VGPR +// Offset in a 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) @@ -2759,12 +3055,6 @@ def : Pat < (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; -def : Pat < - (int_SI_tid), - (V_MBCNT_HI_U32_B32_e64 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) ->; - //===----------------------------------------------------------------------===// // VOP3 Patterns //===----------------------------------------------------------------------===// @@ -2772,16 +3062,6 @@ def : Pat < def : IMad24Pat; def : UMad24Pat; -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1) ->; - defm : BFIPatterns ; def : ROTRPattern ; @@ -2839,19 +3119,6 @@ class DSAtomicRetPat : Pat < (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec -// -// We need to use something for the data0, so we set a register to -// -1. For the non-rtn variants, the manual says it does -// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max -// will always do the increment so I'm assuming it's the same. -class DSAtomicIncRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) ->; - - class DSAtomicCmpXChg : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) @@ -2859,14 +3126,11 @@ class DSAtomicCmpXChg : Pat < // 32-bit atomics.
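// The removed DSAtomicIncRetPat comment above quotes the hardware rule for
// ds_inc: DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1. A minimal C++ model of that
// rule -- an illustrative sketch, not part of this patch:

#include <cstdint>

// Wraps to 0 once the cap d0 is reached, otherwise increments.
uint32_t ds_inc_u32(uint32_t mem, uint32_t d0) {
  return (mem >= d0) ? 0 : mem + 1;
}

// With d0 == UINT32_MAX the guard fires only at mem == UINT32_MAX, where the
// result 0 equals the wrapped mem + 1, so the operation always increments.
// That is why an atomic add of constant 1 could be matched to the inc form
// with data0 = -1, and why those defs are now plain DSAtomicRetPat patterns.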
-def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; @@ -2874,18 +3138,14 @@ def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; - def : DSAtomicCmpXChg; // 64-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; @@ -2901,20 +3161,35 @@ def : DSAtomicCmpXChg; // MUBUF Patterns //===----------------------------------------------------------------------===// -multiclass MUBUFLoad_Pattern { - def : Pat < +class MUBUFLoad_Pattern : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; + +multiclass MUBUFLoad_Atomic_Pattern { + def : Pat < + (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + >; } let Predicates = [isSICI] in { -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; + +defm : MUBUFLoad_Atomic_Pattern ; +defm : MUBUFLoad_Atomic_Pattern ; } // End Predicates = [isSICI] class MUBUFScratchLoadPat : Pat < @@ -2975,6 +3250,25 @@ defm : MUBUF_Load_Dword ; +multiclass MUBUFStore_Atomic_Pattern { + // Store follows atomic op convention so address is first + def : Pat < + (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$val), + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + >; +} +let Predicates = [isSICI] in { +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = [isSICI] + class MUBUFScratchStorePat : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), @@ -2987,22 +3281,6 @@ def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; -/* -class MUBUFStore_Pattern : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $offset) ->; - -let Predicates = [isSICI] in { -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -} // End Predicates = [isSICI] - -*/ - //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// @@ -3029,29 +3307,16 @@ def : MTBUF_StoreResource ; /********** ====================== **********/ multiclass SI_INDIRECT_Pattern { - - // 1.
Extract with offset + // Extract with offset def : Pat< - (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), - (!cast("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) + (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), + (!cast("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; - // 2. Extract without offset + // Insert with offset def : Pat< - (eltvt (extractelt vt:$vec, i32:$idx)), - (!cast("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) - >; - - // 3. Insert with offset - def : Pat< - (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (!cast("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) - >; - - // 4. Insert without offset - def : Pat< - (insertelt vt:$vec, eltvt:$val, i32:$idx), - (!cast("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) + (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), + (!cast("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; } @@ -3111,10 +3376,12 @@ def : ZExt_i64_i32_Pat; def : ZExt_i64_i1_Pat; def : ZExt_i64_i1_Pat; +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple outputs. def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (S_ASHR_I32 $src, 31), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) >; def : Pat < @@ -3214,6 +3481,23 @@ defm : BFMPatterns ; def : BFEPattern ; +let Predicates = [isSICI] in { +def : Pat < + (i64 (readcyclecounter)), + (S_MEMTIME) +>; +} + +def : Pat< + (fcanonicalize f32:$src), + (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) +>; + +def : Pat< + (fcanonicalize f64:$src), + (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -3226,21 +3510,6 @@ let Predicates = [isSI] in { // The workaround for the V_FRACT bug is: // fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) -// Convert (x + (-floor(x)) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_CNDMASK_B64_PSEUDO - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) ->; - // Convert floor(x) to (x - fract(x)) def : Pat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), @@ -3268,6 +3537,9 @@ def : Pat < def : SHA256MaPattern ; +def : IntMed3Pat; +def : IntMed3Pat; + //============================================================================// // Assembler aliases //============================================================================// diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 027a0a2f5167..a9b7c39096e7 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -13,8 +13,6 @@ let TargetPrefix = "SI", isTarget = 1 in { - - def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; @@ -50,9 +48,9 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty, // glc(imm) llvm_i32_ty, // slc(imm) llvm_i32_ty], // tfe(imm) - [IntrReadArgMem]>; + [IntrReadMem, IntrArgMemOnly]>; - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>; // Fully-flexible SAMPLE instruction. class SampleRaw : Intrinsic < @@ -172,28 +170,20 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_image_load_mip : Image; def int_SI_getresinfo : Image; - // Deprecated image and sample intrinsics. 
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - /* Interpolation Intrinsics */ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; +} // End TargetPrefix = "SI", isTarget = 1 +let TargetPrefix = "amdgcn", isTarget = 1 in { /* Control flow Intrinsics */ - def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; + def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; + def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; + def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; + def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; + def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; + def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; + def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>; } diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 1bdb1f0ee9f9..9e972a569a0f 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -26,7 +26,7 @@ // // - Live interval recomputing seems inefficient. This currently only matches // one pair, and recomputes live intervals and moves on to the next pair. It -// would be better to compute a list of all merges that need to occur +// would be better to compute a list of all merges that need to occur. // // - With a list of instructions to process, we can also merge more. 
If a // cluster of loads have offsets that are too large to fit in the 8-bit @@ -36,6 +36,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -61,7 +62,6 @@ private: MachineRegisterInfo *MRI; LiveIntervals *LIS; - static bool offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned EltSize); @@ -69,10 +69,6 @@ private: MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, unsigned EltSize); - void updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx); - MachineBasicBlock::iterator mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -193,17 +189,6 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, return E; } -void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx) { - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), - E = MRI->reg_end(); I != E; ) { - MachineOperand &O = *I; - ++I; - O.substVirtReg(DstReg, SubIdx, *TRI); - } -} - MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -268,19 +253,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( .addOperand(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - LIS->InsertMachineInstrInMaps(Read2); + LIS->InsertMachineInstrInMaps(*Read2); // repairLiveintervalsInRange() doesn't handle physical register, so we have // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); // The new write to the original destination register is now the copy. Steal // the old SlotIndex. - LIS->ReplaceMachineInstrInMaps(I, Copy0); - LIS->ReplaceMachineInstrInMaps(Paired, Copy1); + LIS->ReplaceMachineInstrInMaps(*I, *Copy0); + LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1); I->eraseFromParent(); Paired->eraseFromParent(); @@ -291,7 +276,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( LIS->createAndComputeVirtRegInterval(DestReg); if (UpdateM0Range) { - SlotIndex Read2Index = LIS->getInstructionIndex(Read2); + SlotIndex Read2Index = LIS->getInstructionIndex(*Read2); M0Segment->end = Read2Index.getRegSlot(); } @@ -340,7 +325,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // repairLiveintervalsInRange() doesn't handle physical register, so we have // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); @@ -359,8 +344,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // XXX - How do we express subregisters here? 
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; - LIS->RemoveMachineInstrFromMaps(I); - LIS->RemoveMachineInstrFromMaps(Paired); + LIS->RemoveMachineInstrFromMaps(*I); + LIS->RemoveMachineInstrFromMaps(*Paired); I->eraseFromParent(); Paired->eraseFromParent(); @@ -368,7 +353,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + SlotIndex Write2Index = LIS->getInstructionIndex(*Write2); M0Segment->end = Write2Index.getRegSlot(); } @@ -423,9 +408,16 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo &STM = MF.getSubtarget(); - TRI = static_cast(STM.getRegisterInfo()); - TII = static_cast(STM.getInstrInfo()); + if (skipFunction(*MF.getFunction())) + return false; + + const SISubtarget &STM = MF.getSubtarget(); + if (!STM.loadStoreOptEnabled()) + return false; + + TII = STM.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); LIS = &getAnalysis(); diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 126f6245dfc0..ee1d5dae70b7 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -52,6 +52,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -61,24 +62,24 @@ using namespace llvm; -namespace { +#define DEBUG_TYPE "si-lower-control-flow" -class SILowerControlFlowPass : public MachineFunctionPass { +namespace { +class SILowerControlFlow : public MachineFunctionPass { private: static const unsigned SkipThreshold = 12; - static char ID; const SIRegisterInfo *TRI; const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); + bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); void If(MachineInstr &MI); - void Else(MachineInstr &MI); + void Else(MachineInstr &MI, bool ExecModified); void Break(MachineInstr &MI); void IfBreak(MachineInstr &MI); void ElseBreak(MachineInstr &MI); @@ -88,56 +89,118 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); - void IndirectSrc(MachineInstr &MI); - void IndirectDst(MachineInstr &MI); + MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + std::pair + splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + + void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg); + + void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, + MachineInstr *MovRel, + const MachineOperand &IdxReg, + int Offset); + + bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + std::pair computeIndirectRegAndOffset(unsigned VecReg, + int Offset) const; + bool indirectSrc(MachineInstr &MI); + bool 
indirectDst(MachineInstr &MI); public: - SILowerControlFlowPass(TargetMachine &tm) : + static char ID; + + SILowerControlFlow() : MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI Lower control flow instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); + return "SI Lower control flow pseudo instructions"; } }; } // End anonymous namespace -char SILowerControlFlowPass::ID = 0; +char SILowerControlFlow::ID = 0; + +INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, + "SI lower control flow", false, false) -FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { - return new SILowerControlFlowPass(tm); +char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID; + + +FunctionPass *llvm::createSILowerControlFlowPass() { + return new SILowerControlFlow(); } -bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { +static bool opcodeEmitsNoInsts(unsigned Opc) { + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::BUNDLE: + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + case TargetOpcode::DBG_VALUE: + return true; + default: + return false; + } +} + +bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + if (From->succ_empty()) + return false; unsigned NumInstr = 0; + MachineFunction *MF = From->getParent(); - for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); - MBB = *MBB->succ_begin()) { + for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); NumInstr < SkipThreshold && I != E; ++I) { + if (opcodeEmitsNoInsts(I->getOpcode())) + continue; + + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken + // when EXEC = 0. We should skip the loop lest it become infinite. + if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || + I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) + return true; + + if (I->isInlineAsm()) { + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + const char *AsmStr = I->getOperand(0).getSymbolName(); + + // inlineasm length estimate is the number of bytes assuming the longest + // instruction.
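// A worked instance of the estimate described above (the numbers are
// illustrative, not taken from the patch): an inline-asm blob whose
// worst-case encoding is 40 bytes, on a target whose longest instruction is
// 8 bytes, counts as 40 / 8 = 5 instructions toward the 12-instruction
// SkipThreshold.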
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); + NumInstr += MaxAsmSize / MAI->getMaxInstLength(); + } else { + ++NumInstr; + } - if (I->isBundle() || !I->isBundled()) - if (++NumInstr >= SkipThreshold) - return true; + if (NumInstr >= SkipThreshold) + return true; } } return false; } -void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { +void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) { if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) return; @@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { .addOperand(To); } -void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { - +bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); - if (MBB.getParent()->getInfo()->getShaderType() != - ShaderType::PIXEL || + if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) - return; + return false; + + MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); + MBB.addSuccessor(SkipBB); - MachineBasicBlock::iterator Insert = &MI; - ++Insert; + const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3); + BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&NextBB); + + MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef); + + // ... and terminate wavefront. + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + + return true; } -void SILowerControlFlowPass::If(MachineInstr &MI) { +void SILowerControlFlow::If(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Reg = MI.getOperand(0).getReg(); @@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) { Skip(MI, MI.getOperand(2)); + // Insert a pseudo terminator to help keep the verifier happy. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addOperand(MI.getOperand(2)) + .addReg(Reg); + MI.eraseFromParent(); } -void SILowerControlFlowPass::Else(MachineInstr &MI) { +void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); @@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) { TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) .addReg(Src); // Saved EXEC + if (ExecModified) { + // Adjust the saved exec to account for the modifications during the flow + // block that contains the ELSE. This can happen when WQM mode is switched + // off. 
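// A scalar model of the three-instruction ELSE sequence assembled here,
// assuming the usual S_OR_SAVEEXEC_B64 semantics (dst = EXEC; EXEC |= src).
// An illustrative sketch, not part of the patch:

#include <cstdint>

uint64_t lowerElseMask(uint64_t &exec, uint64_t src, bool execModified) {
  uint64_t dst = exec;   // S_OR_SAVEEXEC_B64: save the current mask ...
  exec |= src;           // ... and merge in the lanes saved by the IF
  if (execModified)
    dst &= exec;         // S_AND_B64: drop lanes disabled in the flow block
  exec ^= dst;           // S_XOR_B64: enable exactly the not-yet-done lanes
  return dst;            // roughly the mask later restored at SI_END_CF
}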
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + } + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC) .addReg(Dst); Skip(MI, MI.getOperand(2)); + // Insert a pseudo terminator to help keep the verifier happy. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addOperand(MI.getOperand(2)) + .addReg(Dst); + MI.eraseFromParent(); } -void SILowerControlFlowPass::Break(MachineInstr &MI) { +void SILowerControlFlow::Break(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(AMDGPU::EXEC) .addReg(Src); @@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { +void SILowerControlFlow::IfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Vcc = MI.getOperand(1).getReg(); unsigned Src = MI.getOperand(2).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(Vcc) .addReg(Src); @@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { +void SILowerControlFlow::ElseBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Saved = MI.getOperand(1).getReg(); unsigned Src = MI.getOperand(2).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(Saved) .addReg(Src); @@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::Loop(MachineInstr &MI) { +void SILowerControlFlow::Loop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Src = MI.getOperand(0).getReg(); @@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::EndCf(MachineInstr &MI) { +void SILowerControlFlow::EndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Reg = MI.getOperand(0).getReg(); @@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::Branch(MachineInstr &MI) { - if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) +void SILowerControlFlow::Branch(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); + if (MBB == MI.getParent()->getNextNode()) MI.eraseFromParent(); // If these aren't equal, this is probably an infinite loop. } -void SILowerControlFlowPass::Kill(MachineInstr &MI) { +void SILowerControlFlow::Kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Op = MI.getOperand(0); #ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo(); + CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); // Kill is only allowed in pixel / geometry shaders. 
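// Per-lane model of the kill lowered below; the "clear this thread" comment
// there explains that a lane survives only when its operand is non-negative.
// A C++ sketch over a 64-lane wavefront (illustrative; the exact compare
// opcode is elided in this extract):

#include <cstdint>

uint64_t killMask(uint64_t exec, const float src[64]) {
  uint64_t out = 0;
  for (unsigned l = 0; l < 64; ++l)
    if (((exec >> l) & 1) && src[l] >= 0.0f) // keep lanes with src >= 0
      out |= 1ull << l;
  return out; // the new EXEC; lanes with a negative operand are masked off
}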
- assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); + assert(CallConv == CallingConv::AMDGPU_PS || + CallConv == CallingConv::AMDGPU_GS); #endif // Clear this thread from the exec mask if the operand is negative @@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { +// All currently live registers must remain so in the remainder block. +void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg) { + // Add reg defined in loop body. + RemainderLiveRegs.addReg(SaveReg); + + if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) { + if (!Val->isUndef()) { + RemainderLiveRegs.addReg(Val->getReg()); + LoopBB.addLiveIn(Val->getReg()); + } + } + + for (unsigned Reg : RemainderLiveRegs) { + if (MRI.isAllocatable(Reg)) + RemainderBB.addLiveIn(Reg); + } + + const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); + if (!Src->isUndef()) + LoopBB.addLiveIn(Src->getReg()); + + if (!IdxReg.isUndef()) + LoopBB.addLiveIn(IdxReg.getReg()); + LoopBB.sortUniqueLiveIns(); +} + +void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, + DebugLoc DL, + MachineInstr *MovRel, + const MachineOperand &IdxReg, + int Offset) { + MachineBasicBlock::iterator I = LoopBB.begin(); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Move index from VCC into M0 + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset != 0) { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + + // Do the actual move + LoopBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&LoopBB); +} + +MachineBasicBlock *SILowerControlFlow::insertSkipBlock( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MachineFunction *MF = MBB.getParent(); + + MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, SkipBB); + + return SkipBB; +} + +std::pair +SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineFunction *MF = MBB.getParent(); + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. 
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessors(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + return std::make_pair(LoopBB, RemainderBB); +} + +// Returns true if a new block was inserted. +bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; + MachineBasicBlock::iterator I(&MI); - unsigned Save = MI.getOperand(1).getReg(); - unsigned Idx = MI.getOperand(3).getReg(); + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - if (AMDGPU::SReg_32RegClass.contains(Idx)) { - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx) - .addImm(Offset); + if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) { + if (Offset != 0) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())) + .addImm(Offset); } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())); } + MBB.insert(I, MovRel); - } else { + MI.eraseFromParent(); + return false; + } - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + SaveOp->setIsDead(false); + unsigned Save = SaveOp->getReg(); - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); + // Reading from a VGPR requires looping over all workitems in the wavefront. 
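// Scalar model of the waterfall loop built by emitLoadM0FromVGPRLoop above:
// each iteration services every lane sharing the index read from the first
// active lane, then retires those lanes from EXEC until none remain. An
// illustrative sketch (the readfirstlane/compare steps are modeled with
// plain loops):

#include <cstdint>

void loadM0Waterfall(uint64_t &exec, const uint32_t idx[64], uint32_t &m0) {
  const uint64_t saved = exec;              // S_MOV_B64: save EXEC
  while (exec != 0) {                       // S_CBRANCH_EXECNZ backedge
    unsigned first = __builtin_ctzll(exec); // V_READFIRSTLANE_B32
    uint32_t uniform = idx[first];
    m0 = uniform;                           // S_MOV_B32 M0
    uint64_t match = 0;                     // V_CMP_EQ_U32 across lanes
    for (unsigned l = 0; l < 64; ++l)
      if (((exec >> l) & 1) && idx[l] == uniform)
        match |= 1ull << l;
    // ... the MOVREL executes here for exactly the 'match' lanes ...
    exec ^= match;                          // S_XOR_B64: retire those lanes
  }
  exec = saved;                             // restored in the remainder block
}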
+ assert(AMDGPU::SReg_64RegClass.contains(Save) && + AMDGPU::VGPR_32RegClass.contains(Idx->getReg())); - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); + // Save the EXEC mask + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); + LivePhysRegs RemainderLiveRegs(TRI); - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) - .addReg(AMDGPU::M0) - .addReg(Idx); + RemainderLiveRegs.addLiveOuts(MBB); - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - // Do the actual move - MBB.insert(I, MovRel); + std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LoopBB); - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); + splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, + *RemainderBB, Save, *Idx); + + emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); + + MachineBasicBlock::iterator First = RemainderBB->begin(); + BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); - } MI.eraseFromParent(); + return true; } -/// \param @VecReg The register which holds element zero of the vector -/// being addressed into. -/// \param[out] @Reg The base register to use in the indirect addressing instruction. -/// \param[in,out] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] -// As an output, this is a constant value that needs -// to be added to the value stored in M0. -void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, - unsigned &Reg, - int &Offset) { +/// \param @VecReg The register which holds element zero of the vector being +/// addressed into. +// +/// \param[in] @Idx The index operand from the movrel instruction. This must be +// a register, but may be NoRegister. +/// +/// \param[in] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant +// value that needs to be added to the value stored in M0. 
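// A reduced sketch of the register/offset split implemented below, under the
// contract documented above (baseIdx is the hardware index of the tuple's
// sub0, numElts its element count). Illustrative only:

#include <utility>

std::pair<unsigned, int> splitIndirectOffset(unsigned baseIdx, int numElts,
                                             int offset) {
  if (offset >= numElts)         // out of bounds: keep the residue for M0
    return {baseIdx, offset};    // rather than naming a nonexistent register
  int regIdx = int(baseIdx) + offset;
  if (regIdx < 0)                // clamped below the tuple: keep the residue
    return {0u, regIdx};
  return {unsigned(regIdx), 0};  // fold the constant part into the register
}

// e.g. a 4-element tuple whose sub0 has index 8: offset 2 yields (10, 0),
// while offset 9 yields (8, 9) and the add is left to the value in M0.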
+std::pair +SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const { unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); if (!SubReg) SubReg = VecReg; + const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg); const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + int NumElts = SuperRC->getSize() / RC->getSize(); + + int BaseRegIdx = TRI->getHWRegIndex(SubReg); + + // Skip out of bounds offsets, or else we would end up using an undefined + // register. + if (Offset >= NumElts) + return std::make_pair(RC->getRegister(BaseRegIdx), Offset); + int RegIdx = BaseRegIdx + Offset; if (RegIdx < 0) { Offset = RegIdx; RegIdx = 0; @@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, Offset = 0; } - Reg = RC->getRegister(RegIdx); + unsigned Reg = RC->getRegister(RegIdx); + return std::make_pair(Reg, Offset); } -void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { - +// Return true if a new block was inserted. +bool SILowerControlFlow::indirectSrc(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); + const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); unsigned Reg; - computeIndirectRegAndOffset(Vec, Reg, Off); + std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + if (Idx->getReg() == AMDGPU::NoRegister) { + // Only had a constant offset, copy the register directly. + BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) + .addReg(Reg, getUndefRegState(SrcVec->isUndef())); + MI.eraseFromParent(); + return false; + } MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(Vec, RegState::Implicit); + .addReg(Reg, getUndefRegState(SrcVec->isUndef())) + .addReg(SrcVec->getReg(), RegState::Implicit); - LoadM0(MI, MovRel, Off); + return loadM0(MI, MovRel, Offset); } -void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { - +// Return true if a new block was inserted. +bool SILowerControlFlow::indirectDst(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); unsigned Reg; - computeIndirectRegAndOffset(Dst, Reg, Off); + const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); + std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset); - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(Dst, RegState::Implicit); + MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + if (Idx->getReg() == AMDGPU::NoRegister) { + // Only had a constant offset, copy the register directly. 
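// The constant-offset fast paths in indirectSrc/indirectDst reduce to plain
// V_MOV copies; when a dynamic index survives, the two movrel forms differ
// only in which side M0 indexes. A scalar model of that difference (an
// illustrative sketch):

float movrels(const float vgpr[], unsigned base, unsigned m0) {
  return vgpr[base + m0];  // V_MOVRELS_B32: indexed source read
}

void movreld(float vgpr[], unsigned base, unsigned m0, float v) {
  vgpr[base + m0] = v;     // V_MOVRELD_B32: indexed destination write
}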
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg) + .addOperand(*Val); + MI.eraseFromParent(); + return false; + } + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg) + .addReg(Val->getReg(), getUndefRegState(Val->isUndef())) + .addReg(Dst, RegState::Implicit); - LoadM0(MI, MovRel, Off); + return loadM0(MI, MovRel, Offset); } -bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); +bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); bool HaveKill = false; - bool NeedWQM = false; bool NeedFlat = false; unsigned Depth = 0; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + MachineFunction::iterator NextBB; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; BI = NextBB) { + NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; + + MachineBasicBlock *EmptyMBBAtEnd = nullptr; MachineBasicBlock::iterator I, Next; + bool ExecModified = false; + for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI) || TII->isDS(MI)) - NeedWQM = true; // Flat uses m0 in case it needs to access LDS. if (TII->isFLAT(MI)) NeedFlat = true; + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + ExecModified = true; + switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: @@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { break; case AMDGPU::SI_ELSE: - Else(MI); + Else(MI, ExecModified); break; case AMDGPU::SI_BREAK: @@ -521,16 +747,20 @@ case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); HaveKill = false; + // TODO: Insert skip if exec is 0? } + EndCf(MI); break; - case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else + case AMDGPU::SI_KILL_TERMINATOR: + if (Depth == 0) { + if (skipIfDead(MI, *NextBB)) { + NextBB = std::next(BI); + BE = MF.end(); + } + } else HaveKill = true; Kill(MI); break; @@ -544,7 +774,15 @@ case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: - IndirectSrc(MI); + if (indirectSrc(MI)) { + // The block was split at this point. We can safely skip the middle + // inserted block to the following one, which contains the rest of this + // block's instructions. + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + break; case AMDGPU::SI_INDIRECT_DST_V1: @@ -552,55 +790,46 @@ case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: - IndirectDst(MI); + if (indirectDst(MI)) { + // The block was split at this point. We can safely skip the middle + // inserted block to the following one, which contains the rest of this + // block's instructions. + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + break; + + case AMDGPU::SI_RETURN: { + assert(!MF.getInfo()->returnsVoid()); + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end.
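// Sketch of the rewrite performed by the SI_RETURN handling that continues
// below (block names are illustrative): any return that is not the
// function's final instruction becomes a branch to one shared empty block
// appended at the end, giving the externally appended bytecode a single
// entry point.
//
//   before:  bb.0: ... SI_RETURN          bb.last: ... SI_RETURN
//   after:   bb.0: ... S_BRANCH bb.end    bb.last: ... SI_RETURN
//            bb.end: (empty)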
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) { + // SI_RETURN is not the last instruction. Add an empty block at + // the end and jump there. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + I->eraseFromParent(); + } + break; + } } } } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - - // FIXME: This seems inappropriate to do here. if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); + // We will need to initialize the flat scratch register pair. + if (NeedFlat) + MFI->setHasFlatInstructions(true); } return true; diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index a2fa5fd93aad..dc1d20ddb274 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -18,7 +18,6 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -47,8 +46,6 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -56,11 +53,8 @@ public: } // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) +INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) char SILowerI1Copies::ID = 0; @@ -72,9 +66,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const SISubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + std::vector I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 49677fc2b0a3..4d12a1ef9a93 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -1,19 +1,17 @@ -//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -/// \file //===----------------------------------------------------------------------===// - #include "SIMachineFunctionInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -22,6 +20,11 @@ using namespace llvm; +static cl::opt EnableSpillSGPRToVGPR( + "amdgpu-spill-sgpr-to-vgpr", + cl::desc("Enable spilling SGPRs to VGPRs"), + cl::ReallyHidden, + cl::init(true)); // Pin the vtable to this file.
void SIMachineFunctionInfo::anchor() {} @@ -48,12 +51,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), ReturnsVoid(true), + MaximumWorkGroupSize(0), + DebuggerReservedVGPRCount(0), + DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), + DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), LDSWaveSpillSize(0), PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + HasNonSpillStackObjects(false), + HasFlatInstructions(false), + NumSpilledSGPRs(0), + NumSpilledVGPRs(0), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -63,37 +74,45 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) GridWorkgroupCountX(false), GridWorkgroupCountY(false), GridWorkgroupCountZ(false), - WorkGroupIDX(true), + WorkGroupIDX(false), WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), PrivateSegmentWaveByteOffset(false), - WorkItemIDX(true), + WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false) { - const AMDGPUSubtarget &ST = MF.getSubtarget(); + const SISubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - if (getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) { KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } - if (F->hasFnAttribute("amdgpu-work-group-id-y")) + if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue()) WorkGroupIDY = true; - if (F->hasFnAttribute("amdgpu-work-group-id-z")) + if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue()) WorkGroupIDZ = true; - if (F->hasFnAttribute("amdgpu-work-item-id-y")) + if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue()) WorkItemIDY = true; - if (F->hasFnAttribute("amdgpu-work-item-id-z")) + if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue()) WorkItemIDZ = true; - bool MaySpill = ST.isVGPRSpillingEnabled(this); + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo->hasStackObjects(); if (HasStackObjects || MaySpill) @@ -105,12 +124,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-dispatch-ptr")) DispatchPtr = true; + + if (F->hasFnAttribute("amdgpu-queue-ptr")) + QueuePtr = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; + // We don't need to worry about accessing spills with flat instructions. + // TODO: On VI where we must use flat for global, we should be able to omit + // this if it is never used for generic access. 
+ if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS && + ST.isAmdHsaOS()) + FlatScratchInit = true; + + if (AMDGPU::isCompute(F->getCallingConv())) + MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F); + else + MaximumWorkGroupSize = ST.getWavefrontSize(); + + if (ST.debuggerReserveRegs()) + DebuggerReservedVGPRCount = 4; } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -142,13 +174,24 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return KernargSegmentPtrUserSGPR; } -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( +unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { + FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return FlatScratchInitUserSGPR; +} + +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx) { - const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast( - MF->getSubtarget().getRegisterInfo()); + if (!EnableSpillSGPRToVGPR) + return SpilledReg(); + + const SISubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); Offset += SubIdx * 4; @@ -157,19 +200,14 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned Lane = (Offset / 4) % 64; struct SpilledReg Spill; + Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); - if (LaneVGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - - // When compiling from inside Mesa, the compilation continues. - // Select an arbitrary register to avoid triggering assertions - // during subsequent passes. - LaneVGPR = AMDGPU::VGPR0; - } + if (LaneVGPR == AMDGPU::NoRegister) + // We have no VGPRs left for spilling SGPRs. + return Spill; LaneVGPRs[LaneVGPRIdx] = LaneVGPR; @@ -182,14 +220,10 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( } Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - Spill.Lane = Lane; return Spill; } unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - // FIXME: We should get this information from kernel attributes if it - // is available. - return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); + return MaximumWorkGroupSize; } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 846ee5de057d..f5bd6366c717 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" #include "SIRegisterInfo.h" +#include #include namespace llvm { @@ -25,7 +25,7 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo : public AMDGPUMachineFunction { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // FIXME: This should be removed and getPreloadedValue moved here. friend struct SIRegisterInfo; void anchor() override; @@ -61,6 +61,15 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned PSInputAddr; bool ReturnsVoid; + unsigned MaximumWorkGroupSize; + + // Number of reserved VGPRs for debugger usage. + unsigned DebuggerReservedVGPRCount; + // Stack object indices for work group IDs. + std::array DebuggerWorkGroupIDStackObjectIndices; + // Stack object indices for work item IDs. + std::array DebuggerWorkItemIDStackObjectIndices; + public: // FIXME: Make private unsigned LDSWaveSpillSize; @@ -73,6 +82,11 @@ public: private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; + bool HasNonSpillStackObjects; + bool HasFlatInstructions; + + unsigned NumSpilledSGPRs; + unsigned NumSpilledVGPRs; // Feature bits required for inputs passed in user SGPRs. bool PrivateSegmentBuffer : 1; @@ -96,7 +110,6 @@ private: bool WorkItemIDY : 1; bool WorkItemIDZ : 1; - MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -111,8 +124,9 @@ public: unsigned VGPR; int Lane; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(0), Lane(-1) { } + SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { } bool hasLane() { return Lane != -1;} + bool hasReg() { return VGPR != AMDGPU::NoRegister;} }; // SIMachineFunctionInfo definition @@ -129,6 +143,7 @@ public: unsigned addDispatchPtr(const SIRegisterInfo &TRI); unsigned addQueuePtr(const SIRegisterInfo &TRI); unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + unsigned addFlatScratchInit(const SIRegisterInfo &TRI); // Add system SGPRs. 
unsigned addWorkGroupIDX() { @@ -161,6 +176,10 @@ public: return PrivateSegmentWaveByteOffsetSystemSGPR; } + void setPrivateSegmentWaveByteOffset(unsigned Reg) { + PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + } + bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } @@ -261,6 +280,10 @@ public: ScratchWaveOffsetReg = Reg; } + unsigned getQueuePtrUserSGPR() const { + return QueuePtrUserSGPR; + } + bool hasSpilledSGPRs() const { return HasSpilledSGPRs; } @@ -277,6 +300,38 @@ public: HasSpilledVGPRs = Spill; } + bool hasNonSpillStackObjects() const { + return HasNonSpillStackObjects; + } + + void setHasNonSpillStackObjects(bool StackObject = true) { + HasNonSpillStackObjects = StackObject; + } + + bool hasFlatInstructions() const { + return HasFlatInstructions; + } + + void setHasFlatInstructions(bool UseFlat = true) { + HasFlatInstructions = UseFlat; + } + + unsigned getNumSpilledSGPRs() const { + return NumSpilledSGPRs; + } + + unsigned getNumSpilledVGPRs() const { + return NumSpilledVGPRs; + } + + void addToSpilledSGPRs(unsigned num) { + NumSpilledSGPRs += num; + } + + void addToSpilledVGPRs(unsigned num) { + NumSpilledVGPRs += num; + } + unsigned getPSInputAddr() const { return PSInputAddr; } @@ -297,10 +352,70 @@ public: ReturnsVoid = Value; } + /// \returns Number of reserved VGPRs for debugger usage. + unsigned getDebuggerReservedVGPRCount() const { + return DebuggerReservedVGPRCount; + } + + /// \returns Stack object index for \p Dim's work group ID. + int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkGroupIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx. + void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns Stack object index for \p Dim's work item ID. + int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkItemIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx. + void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns SGPR used for \p Dim's work group ID. + unsigned getWorkGroupIDSGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkGroupIDX()); + return WorkGroupIDXSystemSGPR; + case 1: + assert(hasWorkGroupIDY()); + return WorkGroupIDYSystemSGPR; + case 2: + assert(hasWorkGroupIDZ()); + return WorkGroupIDZSystemSGPR; + } + llvm_unreachable("unexpected dimension"); + } + + /// \returns VGPR used for \p Dim' work item ID. 
+ unsigned getWorkItemIDVGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkItemIDX()); + return AMDGPU::VGPR0; + case 1: + assert(hasWorkItemIDY()); + return AMDGPU::VGPR1; + case 2: + assert(hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected dimension"); + } + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; } // End namespace llvm - #endif diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 1cfa98430020..7125b411c603 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "SIMachineScheduler.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -295,7 +295,7 @@ static bool isDefBetween(unsigned Reg, const MachineInstr* MI = &*UI; if (MI->isDebugValue()) continue; - SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); if (InstSlot >= First && InstSlot <= Last) return true; } @@ -327,9 +327,9 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); // Do not Track Physical Registers, because it messes up. - for (unsigned Reg : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) - LiveInRegs.insert(Reg); + for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { + if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) + LiveInRegs.insert(RegMaskPair.RegUnit); } LiveOutRegs.clear(); // There is several possibilities to distinguish: @@ -354,11 +354,12 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 // The use of findDefBetween removes the case 4. 
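Both the live-in loop above and the live-out loop just below changed for the same reason: the pressure tracker's live sets now carry RegisterMaskPair elements rather than bare register numbers, hence the .RegUnit projections. A self-contained sketch of the filter; the struct and the MSB test are stand-ins for the real RegisterMaskPair and TargetRegisterInfo::isVirtualRegister.

#include <cstdint>
#include <set>
#include <vector>

struct RegisterMaskPair {
  unsigned RegUnit;
  uint64_t LaneMask;
};

constexpr unsigned VirtualRegFlag = 1u << 31; // virtual registers set the MSB

std::set<unsigned> collectVirtualLiveIns(
    const std::vector<RegisterMaskPair> &LiveInRegs) {
  std::set<unsigned> LiveIns;
  for (const RegisterMaskPair &P : LiveInRegs)
    if (P.RegUnit & VirtualRegFlag) // keep only virtual registers
      LiveIns.insert(P.RegUnit);
  return LiveIns;
}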
- for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) { + for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { + unsigned Reg = RegMaskPair.RegUnit; if (TargetRegisterInfo::isVirtualRegister(Reg) && - isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(), - LIS->getInstructionIndex(EndBlock).getRegSlot(), - MRI, LIS)) { + isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), + LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, + LIS)) { LiveOutRegs.insert(Reg); } } @@ -463,6 +464,9 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { for (SDep& Succ : SU->Succs) { SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NodeNum >= DAG->SUnits.size()) + continue; + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) continue; @@ -521,12 +525,9 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { } Preds.push_back(Pred); -#ifndef NDEBUG - for (SIScheduleBlock* S : Succs) { - if (PredID == S->getID()) - assert(!"Loop in the Block Graph!\n"); - } -#endif + assert(none_of(Succs, + [=](SIScheduleBlock *S) { return PredID == S->getID(); }) && + "Loop in the Block Graph!"); } void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { @@ -540,12 +541,9 @@ void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; Succs.push_back(Succ); -#ifndef NDEBUG - for (SIScheduleBlock* P : Preds) { - if (SuccID == P->getID()) - assert("Loop in the Block Graph!\n"); - } -#endif + assert(none_of(Preds, + [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && + "Loop in the Block Graph!"); } #ifndef NDEBUG @@ -712,8 +710,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { // Traverse TopDown, and give different colors to SUs depending // on which combination of High Latencies they depend on. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + for (unsigned SUNum : DAG->TopDownIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; // Already given. @@ -754,8 +752,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { // Same as before, but BottomUp. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; // Already given. 
@@ -826,8 +824,8 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { unsigned DAGSize = DAG->SUnits.size(); std::vector PendingColoring = CurrentColoring; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; std::set SUColorsPending; @@ -893,8 +891,8 @@ void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -919,8 +917,8 @@ void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -940,8 +938,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -962,8 +960,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { unsigned DAGSize = DAG->SUnits.size(); std::map ColorCount; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; std::map::iterator Pos = ColorCount.find(color); if (Pos != ColorCount.end()) { @@ -973,8 +971,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { } } - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; std::set SUColors; @@ -1006,8 +1004,8 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() { unsigned DAGSize = DAG->SUnits.size(); int GroupID = NextNonReservedID++; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; bool hasSuccessor = false; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -1223,7 +1221,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { // is the most cpu intensive operation of the scheduler. // It would gain a lot if there was a way to recompute the // LiveIntervals for the entire scheduling region. - DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true); + DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true); PosNew.push_back(CurrentTopFastSched); } } @@ -1249,7 +1247,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { DAG->getBB()->splice(POld, DAG->getBB(), PNew); // Update LiveIntervals. 
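The addPred/addSucc hunks above deserve a second look: the old addSucc guard was assert("Loop in the Block Graph!\n"), which asserts a non-null string literal and can never fire, so the rewrite to none_of is a bug fix as well as a cleanup. A standalone rendering of the new pattern; Block is a made-up stand-in for SIScheduleBlock.

#include <algorithm>
#include <cassert>
#include <vector>

struct Block {
  int ID;
  std::vector<Block *> Preds, Succs;
};

void addPred(Block &B, Block *Pred) {
  // Fires in debug builds if Pred is already one of our successors, i.e. the
  // edge about to be recorded would close a cycle in the block graph.
  assert(std::none_of(B.Succs.begin(), B.Succs.end(),
                      [&](const Block *S) { return S->ID == Pred->ID; }) &&
         "Loop in the Block Graph!");
  B.Preds.push_back(Pred);
}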
- DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true); } } @@ -1675,70 +1673,10 @@ ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { // Does a topological sort over the SUs. // Both TopDown and BottomUp void SIScheduleDAGMI::topologicalSort() { - std::vector TopDownSU2Index; - unsigned DAGSize = SUnits.size(); - std::vector WorkList; - - DEBUG(dbgs() << "Topological Sort\n"); - WorkList.reserve(DAGSize); - - TopDownIndex2SU.resize(DAGSize); - TopDownSU2Index.resize(DAGSize); - BottomUpIndex2SU.resize(DAGSize); - - WorkList.push_back(&getExitSU()); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); - TopDownSU2Index[NodeNum] = Degree; - if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); - WorkList.push_back(SU); - } - } - - int Id = DAGSize; - while (!WorkList.empty()) { - SUnit *SU = WorkList.back(); - WorkList.pop_back(); - if (SU->NodeNum < DAGSize) { - TopDownSU2Index[SU->NodeNum] = --Id; - TopDownIndex2SU[Id] = SU->NodeNum; - } - for (SDep& Pred : SU->Preds) { - SUnit *SU = Pred.getSUnit(); - if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) - WorkList.push_back(SU); - } - } - - BottomUpIndex2SU = std::vector(TopDownIndex2SU.rbegin(), - TopDownIndex2SU.rend()); + Topo.InitDAGTopologicalSorting(); -#ifndef NDEBUG - // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SDep& Pred : SU->Preds) { - if (Pred.getSUnit()->NodeNum >= DAGSize) - continue; - assert(TopDownSU2Index[SU->NodeNum] > - TopDownSU2Index[Pred.getSUnit()->NodeNum] && - "Wrong Top Down topological sorting"); - } - } - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SDep& Succ : SU->Succs) { - if (Succ.getSUnit()->NodeNum >= DAGSize) - continue; - assert(TopDownSU2Index[SU->NodeNum] < - TopDownSU2Index[Succ.getSUnit()->NodeNum] && - "Wrong Bottom Up topological sorting"); - } - } -#endif + TopDownIndex2SU = std::vector(Topo.begin(), Topo.end()); + BottomUpIndex2SU = std::vector(Topo.rbegin(), Topo.rend()); } // Move low latencies further from their user without @@ -1759,7 +1697,7 @@ void SIScheduleDAGMI::moveLowLatencies() { for (SDep& PredDep : SU->Preds) { SUnit *Pred = PredDep.getSUnit(); - if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { IsLowLatencyUser = true; } if (Pred->NodeNum >= DAGSize) @@ -1769,7 +1707,7 @@ void SIScheduleDAGMI::moveLowLatencies() { MinPos = PredPos + 1; } - if (SITII->isLowLatencyInstruction(SU->getInstr())) { + if (SITII->isLowLatencyInstruction(*SU->getInstr())) { unsigned BestPos = LastLowLatencyUser + 1; if ((int)BestPos <= LastLowLatencyPos) BestPos = LastLowLatencyPos + 1; @@ -1794,7 +1732,7 @@ void SIScheduleDAGMI::moveLowLatencies() { bool CopyForLowLat = false; for (SDep& SuccDep : SU->Succs) { SUnit *Succ = SuccDep.getSUnit(); - if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { CopyForLowLat = true; } } @@ -1855,7 +1793,6 @@ void SIScheduleDAGMI::schedule() SU.dumpAll(this) ); - Topo.InitDAGTopologicalSorting(); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); // We reuse several ScheduleDAGMI and ScheduleDAGMILive @@ -1878,20 +1815,21 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i 
!= e; ++i) { SUnit *SU = &SUnits[i]; - unsigned BaseLatReg, OffLatReg; - if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BaseLatReg; + int64_t OffLatReg; + if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; - if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, - OffLatReg, TRI)) + if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, + TRI)) LowLatencyOffset[i] = OffLatReg; - } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) IsHighLatencySU[i] = 1; } SIScheduler Scheduler(this); Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); -#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants // which could lead to lower VGPR usage if (Best.MaxVGPRUsage > 180) { @@ -1930,7 +1868,7 @@ void SIScheduleDAGMI::schedule() Best = Temp; } } -#endif + ScheduledSUnits = Best.SUs; ScheduledSUnitsInv.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index b270136811c6..117aed497cc2 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -418,7 +418,7 @@ public: SISchedulerBlockSchedulerVariant ScheduleVariant); }; -class SIScheduleDAGMI : public ScheduleDAGMILive { +class SIScheduleDAGMI final : public ScheduleDAGMILive { const SIInstrInfo *SITII; const SIRegisterInfo *SITRI; @@ -441,7 +441,7 @@ public: // To init Block's RPTracker. void initRPTracker(RegPressureTracker &RPTracker) { - RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false); } MachineBasicBlock *getBB() { return BB; } @@ -460,8 +460,10 @@ public: unsigned &VgprUsage, unsigned &SgprUsage); std::set getInRegs() { - std::set InRegs (RPTracker.getPressure().LiveInRegs.begin(), - RPTracker.getPressure().LiveInRegs.end()); + std::set InRegs; + for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { + InRegs.insert(RegMaskPair.RegUnit); + } return InRegs; }; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 025ed2b5b76b..0dd88ee45c58 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -15,6 +15,7 @@ #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -23,7 +24,75 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { +static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) { + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + const SISubtarget &ST = MF.getSubtarget(); + unsigned SIMDPerCU = 4; + + unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize(); + return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) / + MaxInvocationsPerWave; +} + +static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); + + unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment; + unsigned ReservedSGPRCount; + + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + TotalSGPRCountPerSIMD = 800; + 
AddressableSGPRCount = 102; + SGPRUsageAlignment = 16; + ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK + } else { + TotalSGPRCountPerSIMD = 512; + AddressableSGPRCount = 104; + SGPRUsageAlignment = 8; + ReservedSGPRCount = 2; // VCC + } + + unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD); + MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment); + + if (ST.hasSGPRInitBug()) + MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + + return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount); +} + +static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) { + unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); + unsigned TotalVGPRCountPerSIMD = 256; + unsigned VGPRUsageAlignment = 4; + + return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD, + VGPRUsageAlignment); +} + +static bool hasPressureSet(const int *PSets, unsigned PSetID) { + for (unsigned i = 0; PSets[i] != -1; ++i) { + if (PSets[i] == (int)PSetID) + return true; + } + return false; +} + +void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, + BitVector &PressureSets) const { + for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) { + const int *PSets = getRegUnitPressureSets(*U); + if (hasPressureSet(PSets, PSetID)) { + PressureSets.set(PSetID); + break; + } + } +} + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(), + SGPRPressureSets(getNumRegPressureSets()), + VGPRPressureSets(getNumRegPressureSets()) { unsigned NumRegPressureSets = getNumRegPressureSets(); SGPR32SetID = NumRegPressureSets; @@ -33,6 +102,9 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { SGPR32SetID = i; else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) VGPR32SetID = i; + + classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); + classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); } assert(SGPR32SetID < NumRegPressureSets && VGPR32SetID < NumRegPressureSets); @@ -47,38 +119,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - if (ST.hasSGPRInitBug()) { - // Leave space for flat_scr, xnack_mask, vcc, and alignment - unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; - unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); - } - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and - // 100/101 for vcc. This is the next sgpr128 down. - return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; - } - - return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; + unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - if (ST.hasSGPRInitBug()) { - unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; - return AMDGPU::SGPR_32RegClass.getRegister(Idx); - } - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Next register before reservations for flat_scr, xnack_mask, vcc, - // and scratch resource. 
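The constructor above now precomputes, for every pressure set, whether SGPR0's or VGPR0's register units feed it; getRegPressureSetLimit (further below) then reduces to two bit tests instead of a walk over every register class. A sketch of the idea, with std::bitset standing in for llvm::BitVector and all names illustrative:

#include <bitset>
#include <cstddef>
#include <vector>

constexpr std::size_t MaxPressureSets = 64; // illustrative bound

// One entry per register unit, each listing the pressure sets it feeds
// (the role of getRegUnitPressureSets' -1-terminated arrays).
void classifyPressureSet(unsigned PSetID,
                         const std::vector<std::vector<int>> &UnitSets,
                         std::bitset<MaxPressureSets> &Membership) {
  for (const auto &Sets : UnitSets)
    for (int S : Sets)
      if (S == static_cast<int>(PSetID)) {
        Membership.set(PSetID);
        return;
      }
}

unsigned limitFor(unsigned Idx,
                  const std::bitset<MaxPressureSets> &SGPRSets,
                  const std::bitset<MaxPressureSets> &VGPRSets,
                  unsigned SGPRLimit, unsigned VGPRLimit) {
  if (SGPRSets.test(Idx) && VGPRSets.test(Idx))
    return SGPRLimit + VGPRLimit; // pseudo classes spanning both files
  return SGPRSets.test(Idx) ? SGPRLimit : VGPRLimit;
}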
- return AMDGPU::SGPR91; + unsigned RegCount = getMaxWorkGroupSGPRCount(MF); + unsigned Reg; + + // Try to place it in a hole after PrivateSegmentbufferReg. + if (RegCount & 3) { + // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to + // alignment constraints, so we have a hole where can put the wave offset. + Reg = RegCount - 1; + } else { + // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the + // wave offset before it. + Reg = RegCount - 5; } - - return AMDGPU::SGPR95; + return AMDGPU::SGPR_32RegClass.getRegister(Reg); } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -90,35 +151,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve the last 2 registers so we will always have at least 2 more that - // will physically contain VCC. - reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); - - const AMDGPUSubtarget &ST = MF.getSubtarget(); - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation - // for VCC/XNACK_MASK/FLAT_SCR. - // - // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose - // SGPRs when the XNACK feature is not used. This is currently not done - // because the code that counts SGPRs cannot account for such holes. - reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); - reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); - reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + // Reserve Trap Handler registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::TBA); + reserveRegisterTuples(Reserved, AMDGPU::TMA); + reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); + reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); + reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); + reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); + reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); + reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); + + unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF); + unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF); + + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } - // Tonga and Iceland can only allocate a fixed number of SGPRs due - // to a hw bug. - if (ST.hasSGPRInitBug()) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; - for (unsigned i = Limit; i < NumSGPRs; ++i) { - unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } + for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } const SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -138,48 +194,182 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs" + // attribute was specified. 
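The register budget and the placement of the scratch resource descriptor and wave offset can be checked with plain arithmetic. A standalone sketch using the constants from the hunks above (VI: 800 SGPRs per SIMD, 102 addressable, usage aligned to 16, 6 reserved for VCC/FLAT_SCRATCH/XNACK; 4 SIMDs per CU); alignTo/alignDown are local reimplementations rather than the MathExtras.h versions, and the SGPR-init-bug clamp is deliberately left out.

#include <algorithm>
#include <cstdint>

static uint32_t alignTo(uint32_t V, uint32_t A) { return (V + A - 1) / A * A; }
static uint32_t alignDown(uint32_t V, uint32_t A) { return V / A * A; }

// Wave count per SIMD: work-group size divided (rounded up) by 4 * wavefront.
uint32_t maxWaveCountPerSIMD(uint32_t WorkGroupSize, uint32_t WavefrontSize) {
  uint32_t MaxInvocationsPerWave = 4 * WavefrontSize;
  return alignTo(WorkGroupSize, MaxInvocationsPerWave) / MaxInvocationsPerWave;
}

uint32_t maxWorkGroupSGPRCount(uint32_t WorkGroupSize, uint32_t WavefrontSize) {
  uint32_t Waves = maxWaveCountPerSIMD(WorkGroupSize, WavefrontSize);
  uint32_t MaxSGPRCount = alignDown(800 / Waves, 16); // VI numbers
  return std::min(MaxSGPRCount - 6, 102u);
}

// Placement mirror of reservedPrivateSegment*Reg: the rsrc descriptor is a
// 4-aligned SGPR quad placed as high as possible; if the budget is not a
// multiple of 4 there is a hole above the quad for the wave offset register,
// otherwise it goes directly below the quad.
uint32_t rsrcFirstSGPR(uint32_t Count) { return alignDown(Count, 4) - 4; }
uint32_t waveOffsetSGPR(uint32_t Count) {
  return (Count & 3) ? Count - 1 : Count - 5;
}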
+ const SISubtarget &ST = MF.getSubtarget(); + if (ST.debuggerReserveRegs()) { + unsigned ReservedVGPRFirst = + MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount(); + for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget(); + const SISubtarget &STI = MF.getSubtarget(); // FIXME: We should adjust the max number of waves based on LDS size. - unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), - STI.getMaxWavesPerCU()); + unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); unsigned VSLimit = SGPRLimit + VGPRLimit; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - const TargetRegisterClass *RC = *I; + if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + return VSLimit; + } - unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); - unsigned Limit; + if (SGPRPressureSets.test(Idx)) + return SGPRLimit; - if (isPseudoRegClass(RC)) { - // FIXME: This is a hack. We should never be considering the pressure of - // these since no virtual register should ever have this class. - Limit = VSLimit; - } else if (isSGPRClass(RC)) { - Limit = SGPRLimit / NumSubRegs; - } else { - Limit = VGPRLimit / NumSubRegs; - } + return VGPRLimit; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +bool +SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects(); +} + +bool SIRegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &) const { + // There are no special dedicated stack or frame pointers. + return true; +} + +bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // This helps catch bugs as verifier errors. 
+ return true; +} + +int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, + int Idx) const { + if (!SIInstrInfo::isMUBUF(*MI)) + return 0; + + assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::vaddr) && + "Should never see frame index on non-address operand"); + + int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::offset); + return MI->getOperand(OffIdx).getImm(); +} + +bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + return MI->mayLoadOrStore(); +} - const int *Sets = getRegClassPressureSets(RC); - assert(Sets); - for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) - return Limit; +void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget(); + const SIInstrInfo *TII = Subtarget.getInstrInfo(); + + if (Offset == 0) { + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) + .addFrameIndex(FrameIdx); + return; + } + + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(Offset); + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead) + .addReg(OffsetReg, RegState::Kill) + .addFrameIndex(FrameIdx); +} + +void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget(); + const SIInstrInfo *TII = Subtarget.getInstrInfo(); + +#ifndef NDEBUG + // FIXME: Is it possible to be storing a frame index to itself? + bool SeenFI = false; + for (const MachineOperand &MO: MI.operands()) { + if (MO.isFI()) { + if (SeenFI) + llvm_unreachable("should not see multiple frame indices"); + + SeenFI = true; } } - return 256; +#endif + + MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + assert(FIOp && FIOp->isFI() && "frame index must be address operand"); + + assert(TII->isMUBUF(MI)); + + MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); + int64_t NewOffset = OffsetOp->getImm() + Offset; + if (isUInt<12>(NewOffset)) { + // If we have a legal offset, fold it directly into the instruction. + FIOp->ChangeToRegister(BaseReg, false); + OffsetOp->setImm(NewOffset); + return; + } + + // The offset is not legal, so we must insert an add of the offset. + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + DebugLoc DL = MI.getDebugLoc(); + + assert(Offset != 0 && "Non-zero offset expected"); + + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + // In the case the instruction already had an immediate offset, here only + // the requested new offset is added because we are leaving the original + // immediate in place. 
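The isUInt<12> tests above, and in isFrameOffsetLegal below, encode one hardware fact: the MUBUF immediate offset field is 12 bits, unsigned. A minimal standalone check of the fold-or-add decision in resolveFrameIndex:

#include <cstdint>

bool isUInt12(int64_t V) { return V >= 0 && V < (int64_t(1) << 12); }

// Fold the new offset into the existing immediate when the sum still fits;
// otherwise the caller must materialize it with a separate add.
bool canFoldOffset(int64_t ExistingImm, int64_t Offset) {
  return isUInt12(ExistingImm + Offset);
}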
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead) + .addReg(OffsetReg, RegState::Kill) + .addReg(BaseReg); + + FIOp->ChangeToRegister(NewReg, false); } -bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); +bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + unsigned BaseReg, + int64_t Offset) const { + return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset); +} + +const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( + const MachineFunction &MF, unsigned Kind) const { + // This is inaccurate. It depends on the instruction and address space. The + // only place where we should hit this is for dealing with frame indexes / + // private accesses, so this is correct in that case. + return &AMDGPU::VGPR_32RegClass; } static unsigned getNumSubRegsForSpillOp(unsigned Op) { @@ -219,32 +409,48 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, - unsigned Value, + const MachineOperand *SrcDst, unsigned ScratchRsrcReg, unsigned ScratchOffset, int64_t Offset, RegScavenger *RS) const { + unsigned Value = SrcDst->getReg(); + bool IsKill = SrcDst->isKill(); MachineBasicBlock *MBB = MI->getParent(); - const MachineFunction *MF = MI->getParent()->getParent(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - LLVMContext &Ctx = MF->getFunction()->getContext(); + MachineFunction *MF = MI->getParent()->getParent(); + const SISubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); - bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + bool IsStore = MI->mayStore(); bool RanOutOfSGPRs = false; bool Scavenged = false; unsigned SOffset = ScratchOffset; + unsigned OriginalImmOffset = Offset; unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned Size = NumSubRegs * 4; if (!isUInt<12>(Offset + Size)) { - SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + SOffset = AMDGPU::NoRegister; + + // We don't have access to the register scavenger if this function is called + // during PEI::scavengeFrameVirtualRegs(). + if (RS) + SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass); + if (SOffset == AMDGPU::NoRegister) { + // There are no free SGPRs, and since we are in the process of spilling + // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true + // on SI/CI and on VI it is true until we implement spilling using scalar + // stores), we have no way to free up an SGPR. Our solution here is to + // add the offset directly to the ScratchOffset register, and then + // subtract the offset after the spill to return ScratchOffset to it's + // original value. RanOutOfSGPRs = true; - SOffset = AMDGPU::SGPR0; + SOffset = ScratchOffset; } else { Scavenged = true; } @@ -254,40 +460,48 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, Offset = 0; } - if (RanOutOfSGPRs) - Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; unsigned SOffsetRegState = 0; - if (i + 1 == e && Scavenged) - SOffsetRegState |= RegState::Kill; + unsigned SrcDstRegState = getDefRegState(!IsStore); + if (i + 1 == e) { + SOffsetRegState |= getKillRegState(Scavenged); + // The last implicit use carries the "Kill" flag. + SrcDstRegState |= getKillRegState(IsKill); + } BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(SubReg, getDefRegState(!IsStore)) .addReg(ScratchRsrcReg) .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .addReg(Value, RegState::Implicit | SrcDstRegState) .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } + if (RanOutOfSGPRs) { + // Subtract the offset we added to the ScratchOffset register. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset) + .addReg(ScratchOffset) + .addImm(OriginalImmOffset); + } } void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); + const SISubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); MachineOperand &FIOp = MI->getOperand(FIOperandNum); @@ -301,24 +515,65 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned SuperReg = MI->getOperand(0).getReg(); + bool IsKill = MI->getOperand(0).isKill(); + // SubReg carries the "Kill" flag when SubReg == SuperReg. + unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + unsigned SubReg = getPhysRegSubReg(SuperReg, &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg) - .addImm(Spill.Lane); - - // FIXME: Since this spills to another register instead of an actual - // frame index, we should delete the frame index when all references to - // it are fixed. + if (Spill.hasReg()) { + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg, getKillRegState(IsKill)) + .addImm(Spill.Lane); + + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. + } else { + // Spill SGPR to a frame index. + // FIXME we should use S_STORE_DWORD here for VI. + MachineInstrBuilder Mov + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(SubReg, SubKillState); + + + // There could be undef components of a spilled super register. + // TODO: Can we detect this and skip the spill? + if (NumSubRegs > 1) { + // The last implicit use of the SuperReg carries the "Kill" flag. 
+ unsigned SuperKillState = 0; + if (i + 1 == e) + SuperKillState |= getKillRegState(IsKill); + Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); + } + + unsigned Size = FrameInfo->getObjectSize(Index); + unsigned Align = FrameInfo->getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(i * 4) // offset + .addMemOperand(MMO); + } } MI->eraseFromParent(); + MFI->addToSpilledSGPRs(NumSubRegs); break; } @@ -329,6 +584,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), @@ -336,28 +592,37 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - - // TODO: only do this when it is needed - switch (MF->getSubtarget().getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states - // ("S_NOP 3") on SI - TII->insertWaitStates(MI, 4); - break; - case AMDGPUSubtarget::SEA_ISLANDS: - break; - default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states - // ("S_NOP 4") on VI and later. This also applies to VALUs which write - // VCC, but we're unlikely to see VMEM use VCC. - TII->insertWaitStates(MI, 5); + if (Spill.hasReg()) { + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } else { + // Restore SGPR from a stack slot. + // FIXME: We should use S_LOAD_DWORD here for VI. 
+ + unsigned Align = FrameInfo->getObjectAlignment(Index); + unsigned Size = FrameInfo->getObjectSize(Index); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + .addFrameIndex(Index) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(i * 4) // offset + .addMemOperand(MMO); + BuildMI(*MBB, MI, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpReg, RegState::Kill) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } } MI->eraseFromParent(); @@ -372,11 +637,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V64_SAVE: case AMDGPU::SI_SPILL_V32_SAVE: buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::src), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); + FrameInfo->getObjectOffset(Index) + + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); MI->eraseFromParent(); + MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); break; case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: @@ -385,10 +652,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: { buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::dst), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); + FrameInfo->getObjectOffset(Index) + + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); MI->eraseFromParent(); break; } @@ -396,8 +664,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, default: { int64_t Offset = FrameInfo->getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); @@ -407,10 +675,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { - return getEncodingValue(Reg) & 0xff; -} - // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. 
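Two details of the spill paths above are worth a compact restatement. First, when no SGPR can be found for the offset, the immediate is borrowed into the scratch wave offset register for the duration of the spill and subtracted back afterwards (the S_SUB_U32 above), so the register's value is preserved. Second, kill flags must be distributed across the per-dword copies of a super-register: only the last one may mark the value dead. A standalone model of both, with all names illustrative:

#include <cstdint>

struct WaveOffsetReg { int64_t Value; };

// Borrow the offset into the wave offset register around the spill sequence,
// mirroring the RanOutOfSGPRs path (add before the loop, subtract after it).
template <typename SpillFn>
void spillWithBorrowedOffset(WaveOffsetReg &SOff, int64_t Offset,
                             SpillFn Spill) {
  SOff.Value += Offset;         // stand-in for the implied s_add_u32
  Spill(SOff, /*ImmOffset=*/0); // every access now uses a zero immediate
  SOff.Value -= Offset;         // stand-in for the trailing S_SUB_U32
}

struct SubSpillFlags {
  bool KillSubReg;   // kill on the per-subreg operand (SubReg == SuperReg)
  bool KillSuperReg; // kill on the trailing implicit super-reg use
};

SubSpillFlags subSpillFlags(unsigned I, unsigned NumSubRegs, bool IsKill) {
  return {(NumSubRegs == 1) && IsKill, (I + 1 == NumSubRegs) && IsKill};
}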
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { @@ -427,7 +691,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, - &AMDGPU::SReg_512RegClass + &AMDGPU::SReg_512RegClass, + &AMDGPU::SCC_CLASSRegClass, }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -442,6 +707,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { switch (RC->getSize()) { + case 0: return false; + case 1: return false; case 4: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; case 8: @@ -479,6 +746,24 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } } +const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( + const TargetRegisterClass *VRC) const { + switch (VRC->getSize()) { + case 4: + return &AMDGPU::SGPR_32RegClass; + case 8: + return &AMDGPU::SReg_64RegClass; + case 16: + return &AMDGPU::SReg_128RegClass; + case 32: + return &AMDGPU::SReg_256RegClass; + case 64: + return &AMDGPU::SReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + const TargetRegisterClass *SIRegisterInfo::getSubRegClass( const TargetRegisterClass *RC, unsigned SubIdx) const { if (SubIdx == AMDGPU::NoSubRegister) @@ -552,7 +837,21 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, switch(Channel) { case 0: return AMDGPU::VCC_LO; case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); + default: llvm_unreachable("Invalid SubIdx for VCC"); break; + } + + case AMDGPU::TBA: + switch(Channel) { + case 0: return AMDGPU::TBA_LO; + case 1: return AMDGPU::TBA_HI; + default: llvm_unreachable("Invalid SubIdx for TBA"); break; + } + + case AMDGPU::TMA: + switch(Channel) { + case 0: return AMDGPU::TMA_LO; + case 1: return AMDGPU::TMA_HI; + default: llvm_unreachable("Invalid SubIdx for TMA"); break; } case AMDGPU::FLAT_SCR: @@ -610,7 +909,7 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo(); - const AMDGPUSubtarget &ST = MF.getSubtarget(); + const SISubtarget &ST = MF.getSubtarget(); (void)ST; switch (Value) { case SIRegisterInfo::WORKGROUP_ID_X: @@ -631,11 +930,17 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_ID: + llvm_unreachable("unimplemented"); + case SIRegisterInfo::FLAT_SCRATCH_INIT: + assert(MFI->hasFlatScratchInit()); + return MFI->FlatScratchInitUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); return MFI->DispatchPtrUserSGPR; case SIRegisterInfo::QUEUE_PTR: - llvm_unreachable("not implemented"); + assert(MFI->hasQueuePtr()); + return MFI->QueuePtrUserSGPR; case SIRegisterInfo::WORKITEM_ID_X: assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; @@ -675,9 +980,9 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { } } -unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, +unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const { - if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= 
SISubtarget::VOLCANIC_ISLANDS) { switch (WaveCount) { case 10: return 80; case 9: return 80; @@ -696,3 +1001,14 @@ unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, } } } + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass *RC; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + + return hasVGPRs(RC); +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 9410e2049cba..6e97b1b910a9 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -12,23 +12,27 @@ // //===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" namespace llvm { -struct SIRegisterInfo : public AMDGPURegisterInfo { +class SISubtarget; +class MachineRegisterInfo; + +struct SIRegisterInfo final : public AMDGPURegisterInfo { private: unsigned SGPR32SetID; unsigned VGPR32SetID; + BitVector SGPRPressureSets; + BitVector VGPRPressureSets; void reserveRegisterTuples(BitVector &, unsigned Reg) const; + void classifyPressureSet(unsigned PSetID, unsigned Reg, + BitVector &PressureSets) const; public: SIRegisterInfo(); @@ -47,13 +51,39 @@ public: unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, + int Idx) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const override; + + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + int64_t Offset) const override; + + const TargetRegisterClass *getPointerRegClass( + const MachineFunction &MF, unsigned Kind = 0) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; - unsigned getHWRegIndex(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; + } /// \brief Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. @@ -70,9 +100,12 @@ public: } bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + const TargetRegisterClass *RC; if (TargetRegisterInfo::isVirtualRegister(Reg)) - return isSGPRClass(MRI.getRegClass(Reg)); - return getPhysRegClass(Reg); + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + return isSGPRClass(RC); } /// \returns true if this class contains VGPR registers. 
@@ -89,6 +122,10 @@ public: const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentSGPRClass( + const TargetRegisterClass *VRC) const; + /// \returns The register class that is used for a sub-register of \p RC for /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will /// be returned. @@ -117,10 +154,12 @@ public: enum PreloadedValue { // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, QUEUE_PTR = 2, KERNARG_SEGMENT_PTR = 3, + DISPATCH_ID = 4, + FLAT_SCRATCH_INIT = 5, WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, @@ -143,8 +182,7 @@ public: /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount /// concurrent waves. - unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const; + unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; @@ -152,11 +190,14 @@ public: unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, unsigned Value, + unsigned LoadStoreOp, const MachineOperand *SrcDst, unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, RegScavenger *RS) const; + int64_t Offset, + RegScavenger *RS) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index bfaf93709d8c..c427874d467a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,40 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +// Trap handler registers +def TBA_LO : SIReg<"tba_lo", 108>; +def TBA_HI : SIReg<"tba_hi", 109>; + +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, + DwarfRegAlias { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 108; +} + +def TMA_LO : SIReg<"tma_lo", 110>; +def TMA_HI : SIReg<"tma_hi", 111>; + +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, + DwarfRegAlias { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 110; +} + +def TTMP0 : SIReg <"ttmp0", 112>; +def TTMP1 : SIReg <"ttmp1", 113>; +def TTMP2 : SIReg <"ttmp2", 114>; +def TTMP3 : SIReg <"ttmp3", 115>; +def TTMP4 : SIReg <"ttmp4", 116>; +def TTMP5 : SIReg <"ttmp5", 117>; +def TTMP6 : SIReg <"ttmp6", 118>; +def TTMP7 : SIReg <"ttmp7", 119>; +def TTMP8 : SIReg <"ttmp8", 120>; +def TTMP9 : SIReg <"ttmp9", 121>; +def TTMP10 : SIReg <"ttmp10", 122>; +def TTMP11 : SIReg <"ttmp11", 123>; + multiclass FLAT_SCR_LOHI_m ci_e, bits<16> vi_e> { def _ci : SIReg; def _vi : SIReg; @@ -81,11 +115,18 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { + let CopyCost = -1; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? 
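The RegisterTuples recipes below ((add (decimate SGPR_32, 2)) paired with (add (decimate (shl SGPR_32, 1), 2)), and the new TTMP equivalents) read densely: shl offsets the register sequence, decimate keeps every Nth entry, and the resulting columns are zipped into tuples. The 64-bit case, modelled in plain C++ as a sketch:

#include <utility>
#include <vector>

// Equivalent of the two-column 64-bit tuple recipe: pair register I (from the
// decimated base sequence) with register I+1 (from the shifted sequence).
std::vector<std::pair<int, int>> tuples64(int NumRegs32) {
  std::vector<std::pair<int, int>> Tuples;
  for (int I = 0; I + 1 < NumRegs32; I += 2)
    Tuples.push_back({I, I + 1}); // e.g. SGPR0_SGPR1, SGPR2_SGPR3, ...
  return Tuples;
}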
// SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 103))>; + (add (sequence "SGPR%u", 0, 103))> { + let AllocationPriority = 1; +} // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], @@ -93,7 +134,7 @@ def SGPR_64Regs : RegisterTuples<[sub0, sub1], (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers -def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], +def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), @@ -130,9 +171,29 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// Trap handler TMP 32-bit registers +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "TTMP%u", 0, 11))> { + let isAllocatable = 0; +} + +// Trap handler TMP 64-bit registers +def TTMP_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate TTMP_32, 2)), + (add (decimate (shl TTMP_32, 1), 2))]>; + +// Trap handler TMP 128-bit registers +def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)), + (add (decimate (shl TTMP_32, 3), 4))]>; + // VGPR 32-bit registers def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "VGPR%u", 0, 255))>; + (add (sequence "VGPR%u", 0, 255))> { + let AllocationPriority = 1; +} // VGPR 64-bit registers def VGPR_64 : RegisterTuples<[sub0, sub1], @@ -192,36 +253,67 @@ class RegImmMatcher : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } +// Subset of SReg_32 without M0 for SMRD instructions and alike. +// See comments in SIInstructions.td for more info. 
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { + let AllocationPriority = 1; +} + // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) ->; + (add SReg_32_XM0, M0)> { + let AllocationPriority = 1; +} + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { + let AllocationPriority = 2; +} -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { + let isAllocatable = 0; +} def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR) ->; + (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> { + let AllocationPriority = 2; +} -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { - // Requires 2 s_mov_b64 to copy - let CopyCost = 2; +// Requires 2 s_mov_b64 to copy +let CopyCost = 2 in { + +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { + let AllocationPriority = 4; +} + +def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { + let isAllocatable = 0; +} + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { + let AllocationPriority = 4; } -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { +} // End CopyCost = 2 + +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; + let AllocationPriority = 5; } def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; + let AllocationPriority = 6; } // Register class for all vector registers (VGPRs + Interploation Registers) def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { // Requires 2 v_mov_b32 to copy let CopyCost = 2; + let AllocationPriority = 2; } def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { @@ -229,19 +321,23 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { // Requires 3 v_mov_b32 to copy let CopyCost = 3; + let AllocationPriority = 3; } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { // Requires 4 v_mov_b32 to copy let CopyCost = 4; + let AllocationPriority = 4; } -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { +def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { let CopyCost = 8; + let AllocationPriority = 5; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { let CopyCost = 16; + let AllocationPriority = 6; } def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index cd77e519abb2..ed19217226b8 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -11,6 +11,12 @@ // //===----------------------------------------------------------------------===// +def : PredicateProlog<[{ + const SIInstrInfo *TII = + static_cast(SchedModel->getInstrInfo()); + (void)TII; +}]>; + def WriteBranch : SchedWrite; def WriteExport : SchedWrite; def WriteLDS : 
SchedWrite; @@ -39,20 +45,33 @@ def Write64Bit : SchedWrite; // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) -def SIFullSpeedModel : SchedMachineModel; -def SIQuarterSpeedModel : SchedMachineModel; +class SISchedMachineModel : SchedMachineModel { + let CompleteModel = 0; + let IssueWidth = 1; + let PostRAScheduler = 1; +} -// BufferSize = 0 means the processors are in-order. -let BufferSize = 0 in { +def SIFullSpeedModel : SISchedMachineModel; +def SIQuarterSpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? -def HWBranch : ProcResource<1>; -def HWExport : ProcResource<7>; // Taken from S_WAITCNT -def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT -def HWSALU : ProcResource<1>; -def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT -def HWVALU : ProcResource<1>; - +def HWBranch : ProcResource<1> { + let BufferSize = 1; +} +def HWExport : ProcResource<1> { + let BufferSize = 7; // Taken from S_WAITCNT +} +def HWLGKM : ProcResource<1> { + let BufferSize = 31; // Taken from S_WAITCNT +} +def HWSALU : ProcResource<1> { + let BufferSize = 1; +} +def HWVMEM : ProcResource<1> { + let BufferSize = 15; // Taken from S_WAITCNT +} +def HWVALU : ProcResource<1> { + let BufferSize = 1; } class HWWriteRes resources, @@ -70,12 +89,12 @@ class HWVALUWriteRes : // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 2 - 64 - def : HWWriteRes; - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 300 - 600 + def : HWWriteRes; + def : HWWriteRes; + def : HWWriteRes; // Can be between 2 and 64 + def : HWWriteRes; + def : HWWriteRes; + def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? def : HWVALUWriteRes; @@ -83,6 +102,12 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes; } +def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; +def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>; +def WriteCopy : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; let SchedModel = SIFullSpeedModel in { @@ -92,6 +117,8 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIFullSpeedModel let SchedModel = SIQuarterSpeedModel in { @@ -102,4 +129,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 4f0913fe62f2..6cba55300a8c 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk, STATISTIC(NumLiteralConstantsFolded, "Number of literal constants folded into 32-bit instructions."); -namespace llvm { - void initializeSIShrinkInstructionsPass(PassRegistry&); -} - using namespace llvm; namespace { @@ -61,10 +57,8 @@ public: } // End anonymous namespace. 
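On the WriteCopy variant introduced in the SISchedule.td hunk above: the SchedVar operand lists were lost in extraction, but the surviving predicates make the intended selection clear. A standalone model of that selection (the enum and helper are illustrative; only TII->isVGPRCopy and TII->getOpSize come from the patch, and the SALU fall-through is an assumption):

// Illustrative model of the WriteCopy selection; IsVGPRCopy and
// DstSizeInBits stand in for the TII->isVGPRCopy(*MI) and
// TII->getOpSize(*MI, 0) queries in the SchedPredicates above.
enum class CopyWrite { Salu, Valu32, Valu64 };

CopyWrite classifyCopy(bool IsVGPRCopy, unsigned DstSizeInBits) {
  if (!IsVGPRCopy)
    return CopyWrite::Salu; // assumed fall-through variant -> WriteSALU
  return DstSizeInBits <= 32 ? CopyWrite::Valu32  // PredIsVGPR32Copy
                             : CopyWrite::Valu64; // PredIsVGPR64Copy
}

The point of the variant is that a generic COPY has no fixed cost until it is known whether it lowers to a scalar move or to one or two v_mov_b32s.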
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
-                      "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
-                    "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+                "SI Shrink Instructions", false, false)
 
 char SIShrinkInstructions::ID = 0;
 
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
   if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     return false;
 
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
-    return false;
-
-  return true;
+  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
 }
 
 /// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@
   }
 
   // We have failed to fold src0, so commute the instruction and try again.
-  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
     foldImmediates(MI, TII, MRI, false);
 }
 
-// Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
-  assert(!Orig.isImplicit());
-  return MachineOperand::CreateReg(Orig.getReg(),
-                                   Orig.isDef(),
-                                   true,
-                                   Orig.isKill(),
-                                   Orig.isDead(),
-                                   Orig.isUndef(),
-                                   Orig.isEarlyClobber(),
-                                   Orig.getSubReg(),
-                                   Orig.isDebug(),
-                                   Orig.isInternalRead());
+// Copy the undef and kill flags from the original explicit vcc operand to
+// the implicit vcc operand that was already added when the 32-bit
+// instruction was built.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
 }
 
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
   std::vector<unsigned> I1Defs;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+        // If this has a literal constant source that is the same as the
+        // reversed bits of an inline immediate, replace with a bitreverse of
+        // that constant. This saves 4 bytes in the common case of
+        // materializing sign bits.
+
+        // Test if we are after regalloc. We only want to do this after any
+        // optimizations happen because this will confuse them.
+        // XXX - not exactly a check for post-regalloc run.
+        MachineOperand &Src = MI.getOperand(1);
+        if (Src.isImm() &&
+            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+          int64_t Imm = Src.getImm();
+          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+            int32_t ReverseImm = reverseBits(static_cast<int32_t>(Imm));
+            if (ReverseImm >= -16 && ReverseImm <= 64) {
+              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+              Src.setImm(ReverseImm);
+              continue;
+            }
+          }
+        }
+      }
+
+      // Combine adjacent s_nops to use the immediate operand encoding how long
+      // to wait.
+ // + // s_nop N + // s_nop M + // => + // s_nop (N + M) + if (MI.getOpcode() == AMDGPU::S_NOP && + Next != MBB.end() && + (*Next).getOpcode() == AMDGPU::S_NOP) { + + MachineInstr &NextMI = *Next; + // The instruction encodes the amount to wait with an offset of 1, + // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back + // after adding. + uint8_t Nop0 = MI.getOperand(0).getImm() + 1; + uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; + + // Make sure we don't overflow the bounds. + if (Nop0 + Nop1 <= 8) { + NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); + MI.eraseFromParent(); + } + + continue; + } + + // FIXME: We also need to consider movs of constant operands since + // immediate operands are not folded if they have more than one use, and + // the operand folding pass is unaware if the immediate will be free since + // it won't know if the src == dest constraint will end up being + // satisfied. + if (MI.getOpcode() == AMDGPU::S_ADD_I32 || + MI.getOpcode() == AMDGPU::S_MUL_I32) { + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); + + // FIXME: This could work better if hints worked with subregisters. If + // we have a vector add of a constant, we usually don't get the correct + // allocation due to the subregister usage. + if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) && + Src0.isReg()) { + MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg()); + continue; + } + + if (Src0.isReg() && Src0.getReg() == Dest.getReg()) { + if (Src1.isImm() && isKImmOperand(TII, Src1)) { + unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? + AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; + + MI.setDesc(TII->get(Opc)); + MI.tieOperands(0, 1); + } + } + } + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. if (MI.getOpcode() == AMDGPU::S_MOV_B32) { const MachineOperand &Src = MI.getOperand(1); - if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - } + if (Src.isImm() && isKImmOperand(TII, Src)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); continue; } @@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (!canShrink(MI, TII, TRI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. - if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + if (!MI.isCommutable() || !TII->commuteInstruction(MI) || !canShrink(MI, TII, TRI, MRI)) continue; } @@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. - int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); if (Op32DstIdx != -1) { // dst Inst32.addOperand(MI.getOperand(0)); @@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Inst32.addOperand(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is - // replaced with an implicit read of vcc. 
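The s_nop combine a few hunks above is pure immediate arithmetic: the operand encodes one less than the number of wait cycles, and the pass only merges when the combined wait still fits the 8-cycle bound it checks. A compilable restatement (the function name is illustrative):

#include <cstdint>
#include <optional>

// Decode both waits (imm + 1 cycles each); re-encode the sum only if the
// combined wait still fits the 8-cycle bound the pass checks.
std::optional<uint8_t> mergeSNopImms(uint8_t Imm0, uint8_t Imm1) {
  unsigned Cycles = (Imm0 + 1u) + (Imm1 + 1u);
  if (Cycles > 8)
    return std::nullopt; // keep the two s_nops separate
  return static_cast<uint8_t>(Cycles - 1);
}

For example, s_nop 1 followed by s_nop 2 (2 + 3 = 5 cycles) becomes a single s_nop 4.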
- assert(Src2->getReg() == AMDGPU::VCC && - "Unexpected missing register operand"); - Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + // replaced with an implicit read of vcc. This was already added + // during the initial BuildMI, so find it to preserve the flags. + copyFlagsToImplicitVCC(*Inst32, *Src2); } } diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp index d36c5d29b127..facc0c7df1dc 100644 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -62,7 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F.getCallingConv())) return false; visit(F); diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp new file mode 100644 index 000000000000..c1a237ea5f51 --- /dev/null +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -0,0 +1,509 @@ +//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass adds instructions to enable whole quad mode for pixel +/// shaders. +/// +/// Whole quad mode is required for derivative computations, but it interferes +/// with shader side effects (stores and atomics). This pass is run on the +/// scheduled machine IR but before register coalescing, so that machine SSA is +/// available for analysis. It ensures that WQM is enabled when necessary, but +/// disabled around stores and atomics. +/// +/// When necessary, this pass creates a function prolog +/// +/// S_MOV_B64 LiveMask, EXEC +/// S_WQM_B64 EXEC, EXEC +/// +/// to enter WQM at the top of the function and surrounds blocks of Exact +/// instructions by +/// +/// S_AND_SAVEEXEC_B64 Tmp, LiveMask +/// ... +/// S_MOV_B64 EXEC, Tmp +/// +/// In order to avoid excessive switching during sequences of Exact +/// instructions, the pass first analyzes which instructions must be run in WQM +/// (aka which instructions produce values that lead to derivative +/// computations). +/// +/// Basic blocks are always exited in WQM as long as some successor needs WQM. +/// +/// There is room for improvement given better control flow analysis: +/// +/// (1) at the top level (outside of control flow statements, and as long as +/// kill hasn't been used), one SGPR can be saved by recovering WQM from +/// the LiveMask (this is implemented for the entry block). +/// +/// (2) when entire regions (e.g. if-else blocks or entire loops) only +/// consist of exact and don't-care instructions, the switch only has to +/// be done at the entry and exit points rather than potentially in each +/// block of the region. 
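For reference while reading this pass: S_WQM_B64, which the prolog above applies to EXEC, turns on every lane of any 2x2 pixel quad that has at least one live lane. A sketch of that semantics (the helper name is illustrative; the nibble-per-quad reading follows the ISA documentation):

#include <cstdint>

// Each nibble of the 64-bit exec mask covers one 2x2 pixel quad: if any
// lane in a quad is live, all four of its lanes become live.
uint64_t wholeQuadMode(uint64_t Exec) {
  uint64_t Result = 0;
  for (unsigned Quad = 0; Quad < 16; ++Quad)
    if ((Exec >> (4 * Quad)) & 0xF)
      Result |= uint64_t(0xF) << (4 * Quad);
  return Result;
}

This is why the pass saves the original EXEC into LiveMask: the widened mask includes helper lanes that must be masked back out around stores and atomics.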
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-wqm" + +namespace { + +enum { + StateWQM = 0x1, + StateExact = 0x2, +}; + +struct InstrInfo { + char Needs = 0; + char OutNeeds = 0; +}; + +struct BlockInfo { + char Needs = 0; + char InNeeds = 0; + char OutNeeds = 0; +}; + +struct WorkItem { + MachineBasicBlock *MBB = nullptr; + MachineInstr *MI = nullptr; + + WorkItem() {} + WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} + WorkItem(MachineInstr *MI) : MI(MI) {} +}; + +class SIWholeQuadMode : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + DenseMap Instructions; + DenseMap Blocks; + SmallVector ExecExports; + SmallVector LiveMaskQueries; + + char scanInstructions(MachineFunction &MF, std::vector &Worklist); + void propagateInstruction(MachineInstr &MI, std::vector &Worklist); + void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); + char analyzeFunction(MachineFunction &MF); + + void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg); + void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedWQM); + void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + + void lowerLiveMaskQueries(unsigned LiveMaskReg); + +public: + static char ID; + + SIWholeQuadMode() : + MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Whole Quad Mode"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SIWholeQuadMode::ID = 0; + +INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, + "SI Whole Quad Mode", false, false) + +char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; + +FunctionPass *llvm::createSIWholeQuadModePass() { + return new SIWholeQuadMode; +} + +// Scan instructions to determine which ones require an Exact execmask and +// which ones seed WQM requirements. +char SIWholeQuadMode::scanInstructions(MachineFunction &MF, + std::vector &Worklist) { + char GlobalFlags = 0; + bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); + + for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = *BI; + + for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { + MachineInstr &MI = *II; + unsigned Opcode = MI.getOpcode(); + char Flags = 0; + + if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { + Flags = StateWQM; + } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + Flags = StateExact; + } else { + // Handle export instructions with the exec mask valid flag set + if (Opcode == AMDGPU::EXP) { + if (MI.getOperand(4).getImm() != 0) + ExecExports.push_back(&MI); + } else if (Opcode == AMDGPU::SI_PS_LIVE) { + LiveMaskQueries.push_back(&MI); + } else if (WQMOutputs) { + // The function is in machine SSA form, which means that physical + // VGPRs correspond to shader inputs and outputs. Inputs are + // only used, outputs are only defined. 
+ for (const MachineOperand &MO : MI.defs()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVirtualRegister(Reg) && + TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + Flags = StateWQM; + break; + } + } + } + + if (!Flags) + continue; + } + + Instructions[&MI].Needs = Flags; + Worklist.push_back(&MI); + GlobalFlags |= Flags; + } + + if (WQMOutputs && MBB.succ_empty()) { + // This is a prolog shader. Make sure we go back to exact mode at the end. + Blocks[&MBB].OutNeeds = StateExact; + Worklist.push_back(&MBB); + GlobalFlags |= StateExact; + } + } + + return GlobalFlags; +} + +void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, + std::vector& Worklist) { + MachineBasicBlock *MBB = MI.getParent(); + InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references + BlockInfo &BI = Blocks[MBB]; + + // Control flow-type instructions that are followed by WQM computations + // must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + Instructions[&MI].Needs = StateWQM; + II.Needs = StateWQM; + } + + // Propagate to block level + BI.Needs |= II.Needs; + if ((BI.InNeeds | II.Needs) != BI.InNeeds) { + BI.InNeeds |= II.Needs; + Worklist.push_back(MBB); + } + + // Propagate backwards within block + if (MachineInstr *PrevMI = MI.getPrevNode()) { + char InNeeds = II.Needs | II.OutNeeds; + if (!PrevMI->isPHI()) { + InstrInfo &PrevII = Instructions[PrevMI]; + if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { + PrevII.OutNeeds |= InNeeds; + Worklist.push_back(PrevMI); + } + } + } + + // Propagate WQM flag to instruction inputs + assert(II.Needs != (StateWQM | StateExact)); + if (II.Needs != StateWQM) + return; + + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) + continue; + + // At this point, physical registers appear as inputs or outputs + // and following them makes no sense (and would in fact be incorrect + // when the same VGPR is used as both an output and an input that leads + // to a NeedsWQM instruction). + // + // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we + // have to trace this, in practice it happens for 64-bit computations like + // pointers where both dwords are followed already anyway. + if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) + continue; + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { + InstrInfo &DefII = Instructions[&DefMI]; + + // Obviously skip if DefMI is already flagged as NeedWQM. + // + // The instruction might also be flagged as NeedExact. This happens when + // the result of an atomic is used in a WQM computation. In this case, + // the atomic must not run for helper pixels and the WQM result is + // undefined. + if (DefII.Needs != 0) + continue; + + DefII.Needs = StateWQM; + Worklist.push_back(&DefMI); + } + } +} + +void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, + std::vector& Worklist) { + BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. + + // Propagate through instructions + if (!MBB.empty()) { + MachineInstr *LastMI = &*MBB.rbegin(); + InstrInfo &LastII = Instructions[LastMI]; + if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { + LastII.OutNeeds |= BI.OutNeeds; + Worklist.push_back(LastMI); + } + } + + // Predecessor blocks must provide for our WQM/Exact needs. 
+ for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) + continue; + + PredBI.OutNeeds |= BI.InNeeds; + PredBI.InNeeds |= BI.InNeeds; + Worklist.push_back(Pred); + } + + // All successors must be prepared to accept the same set of WQM/Exact data. + for (MachineBasicBlock *Succ : MBB.successors()) { + BlockInfo &SuccBI = Blocks[Succ]; + if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) + continue; + + SuccBI.InNeeds |= BI.OutNeeds; + Worklist.push_back(Succ); + } +} + +char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { + std::vector Worklist; + char GlobalFlags = scanInstructions(MF, Worklist); + + while (!Worklist.empty()) { + WorkItem WI = Worklist.back(); + Worklist.pop_back(); + + if (WI.MI) + propagateInstruction(*WI.MI, Worklist); + else + propagateBlock(*WI.MBB, Worklist); + } + + return GlobalFlags; +} + +void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg) { + if (SaveWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + SaveWQM) + .addReg(LiveMaskReg); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(LiveMaskReg); + } +} + +void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedWQM) { + if (SavedWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedWQM); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + } +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, + bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + const BlockInfo &BI = BII->second; + + if (!(BI.InNeeds & StateWQM)) + return; + + // This is a non-entry block that is WQM throughout, so no need to do + // anything. + if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + return; + + unsigned SavedWQMReg = 0; + bool WQMFromExec = isEntry; + char State = isEntry ? StateExact : StateWQM; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + ++II; + + // Skip instructions that are not affected by EXEC + if (TII->isScalarUnit(MI) && !MI.isTerminator()) + continue; + + // Generic instructions such as COPY will either disappear by register + // coalescing or be lowered to SALU or VALU instructions. + if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { + if (MI.getNumExplicitOperands() >= 1) { + const MachineOperand &Op = MI.getOperand(0); + if (Op.isReg()) { + if (TRI->isSGPRReg(*MRI, Op.getReg())) { + // SGPR instructions are not affected by EXEC + continue; + } + } + } + } + + char Needs = 0; + char OutNeeds = 0; + auto InstrInfoIt = Instructions.find(&MI); + if (InstrInfoIt != Instructions.end()) { + Needs = InstrInfoIt->second.Needs; + OutNeeds = InstrInfoIt->second.OutNeeds; + + // Make sure to switch to Exact mode before the end of the block when + // Exact and only Exact is needed further downstream. 
+ if (OutNeeds == StateExact && MI.isTerminator()) { + assert(Needs == 0); + Needs = StateExact; + } + } + + // State switching + if (Needs && State != Needs) { + if (Needs == StateExact) { + assert(!SavedWQMReg); + + if (!WQMFromExec && (OutNeeds & StateWQM)) + SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + toExact(MBB, &MI, SavedWQMReg, LiveMaskReg); + } else { + assert(WQMFromExec == (SavedWQMReg == 0)); + toWQM(MBB, &MI, SavedWQMReg); + SavedWQMReg = 0; + } + + State = Needs; + } + } + + if ((BI.OutNeeds & StateWQM) && State != StateWQM) { + assert(WQMFromExec == (SavedWQMReg == 0)); + toWQM(MBB, MBB.end(), SavedWQMReg); + } else if (BI.OutNeeds == StateExact && State != StateExact) { + toExact(MBB, MBB.end(), 0, LiveMaskReg); + } +} + +void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + for (MachineInstr *MI : LiveMaskQueries) { + const DebugLoc &DL = MI->getDebugLoc(); + unsigned Dest = MI->getOperand(0).getReg(); + BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + MI->eraseFromParent(); + } +} + +bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { + if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) + return false; + + Instructions.clear(); + Blocks.clear(); + ExecExports.clear(); + LiveMaskQueries.clear(); + + const SISubtarget &ST = MF.getSubtarget(); + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + char GlobalFlags = analyzeFunction(MF); + if (!(GlobalFlags & StateWQM)) { + lowerLiveMaskQueries(AMDGPU::EXEC); + return !LiveMaskQueries.empty(); + } + + // Store a copy of the original live mask when required + unsigned LiveMaskReg = 0; + { + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); + + if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { + LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(AMDGPU::EXEC); + } + + if (GlobalFlags == StateWQM) { + // For a shader that needs only WQM, we can just set it once. + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + + lowerLiveMaskQueries(LiveMaskReg); + // EntryMI may become invalid here + return true; + } + } + + lowerLiveMaskQueries(LiveMaskReg); + + // Handle the general case + for (auto BII : Blocks) + processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + + return true; +} diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile deleted file mode 100644 index 1b232871bd62..000000000000 --- a/lib/Target/AMDGPU/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUInfo - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp new file mode 100644 index 000000000000..b6868de6a74e --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -0,0 +1,69 @@ +//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "AMDGPUAsmUtils.h" + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { + +// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + nullptr, + "MSG_INTERRUPT", + "MSG_GS", + "MSG_GS_DONE", + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "MSG_SYSMSG" +}; + +// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h. +const char* const OpSysSymbolic[] = { + nullptr, + "SYSMSG_OP_ECC_ERR_INTERRUPT", + "SYSMSG_OP_REG_RD", + "SYSMSG_OP_HOST_TRAP_ACK", + "SYSMSG_OP_TTRACE_PC" +}; + +const char* const OpGsSymbolic[] = { + "GS_OP_NOP", + "GS_OP_CUT", + "GS_OP_EMIT", + "GS_OP_EMIT_CUT" +}; + +} // namespace SendMsg + +namespace Hwreg { + +// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h. +const char* const IdSymbolic[] = { + nullptr, + "HW_REG_MODE", + "HW_REG_STATUS", + "HW_REG_TRAPSTS", + "HW_REG_HW_ID", + "HW_REG_GPR_ALLOC", + "HW_REG_LDS_ALLOC", + "HW_REG_IB_STS" +}; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h new file mode 100644 index 000000000000..b2dc2c0e364c --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -0,0 +1,31 @@ +//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. + +extern const char* const IdSymbolic[]; +extern const char* const OpSysSymbolic[]; +extern const char* const OpGsSymbolic[]; + +} // namespace SendMsg + +namespace Hwreg { // Symbolic names for the hwreg(...) syntax. 
+ +extern const char* const IdSymbolic[]; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1f5deaef9d3b..c6f9142c0aa5 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -109,29 +109,45 @@ bool isReadOnlySegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } -static unsigned getIntegerAttribute(const Function &F, const char *Name, - unsigned Default) { +int getIntegerAttribute(const Function &F, StringRef Name, int Default) { Attribute A = F.getFnAttribute(Name); - unsigned Result = Default; + int Result = Default; if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); if (Str.getAsInteger(0, Result)) { LLVMContext &Ctx = F.getContext(); - Ctx.emitError("can't parse shader type"); + Ctx.emitError("can't parse integer attribute " + Name); } } + return Result; } -unsigned getShaderType(const Function &F) { - return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +unsigned getMaximumWorkGroupSize(const Function &F) { + return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256); } unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } +bool isShader(CallingConv::ID cc) { + switch(cc) { + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + +bool isCompute(CallingConv::ID cc) { + return !isShader(cc) || cc == CallingConv::AMDGPU_CS; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 57cbe1b58f98..995a9041fb36 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #include "AMDKernelCodeT.h" +#include "llvm/IR/CallingConv.h" namespace llvm { @@ -44,9 +45,13 @@ bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); -unsigned getShaderType(const Function &F); +int getIntegerAttribute(const Function &F, StringRef Name, int Default); + +unsigned getMaximumWorkGroupSize(const Function &F); unsigned getInitialPSInputAddr(const Function &F); +bool isShader(CallingConv::ID cc); +bool isCompute(CallingConv::ID cc); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h new file mode 100644 index 000000000000..3a5ff60601d0 --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -0,0 +1,165 @@ +//===--------------------- AMDKernelCodeTInfo.h ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file - specifies tables for amd_kernel_code_t structure parsing/printing +// +//===----------------------------------------------------------------------===// + +#define QNAME(name) amd_kernel_code_t::name +#define FLD_T(name) decltype(QNAME(name)), &QNAME(name) + +#define FIELD2(sname, name) \ + RECORD(sname, printField, parseField) + +#define FIELD(name) FIELD2(name, name) + + +#define PRINTCODEPROP(name) \ + printBitField + +#define PARSECODEPROP(name) \ + parseBitField + +#define CODEPROP(name, shift) \ + RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift)) + +// have to define these lambdas because of Set/GetMacro +#define PRINTCOMP(GetMacro, Shift) \ +[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \ + printName(OS, Name) << \ + (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \ +} +#define PARSECOMP(SetMacro, Shift) \ +[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \ + int64_t Value = 0; \ + if (!expectAbsExpression(MCParser, Value, Err)) \ + return false; \ + C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \ + return true; \ +} + +#define COMPPGM(name, GetMacro, SetMacro, Shift) \ + RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift)) + +#define COMPPGM1(name, AccMacro) \ + COMPPGM(compute_pgm_rsrc1_##name, \ + G_00B848_##AccMacro, S_00B848_##AccMacro, 0) + +#define COMPPGM2(name, AccMacro) \ + COMPPGM(compute_pgm_rsrc2_##name, \ + G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32) + +/////////////////////////////////////////////////////////////////////////////// +// Begin of the table +// Define RECORD(name, print, parse) in your code to get field definitions +// and include this file + +FIELD2(kernel_code_version_major, amd_kernel_code_version_major), +FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor), +FIELD2(machine_kind, amd_machine_kind), +FIELD2(machine_version_major, amd_machine_version_major), +FIELD2(machine_version_minor, amd_machine_version_minor), +FIELD2(machine_version_stepping, amd_machine_version_stepping), +FIELD(kernel_code_entry_byte_offset), +FIELD(kernel_code_prefetch_byte_size), +FIELD(max_scratch_backing_memory_byte_size), +FIELD(compute_pgm_resource_registers), +FIELD(workitem_private_segment_byte_size), +FIELD(workgroup_group_segment_byte_size), +FIELD(gds_segment_byte_size), +FIELD(kernarg_segment_byte_size), +FIELD(workgroup_fbarrier_count), +FIELD(wavefront_sgpr_count), +FIELD(workitem_vgpr_count), +FIELD(reserved_vgpr_first), +FIELD(reserved_vgpr_count), +FIELD(reserved_sgpr_first), +FIELD(reserved_sgpr_count), +FIELD(debug_wavefront_private_segment_offset_sgpr), +FIELD(debug_private_segment_buffer_sgpr), +FIELD(kernarg_segment_alignment), +FIELD(group_segment_alignment), +FIELD(private_segment_alignment), +FIELD(wavefront_size), +FIELD(call_convention), +FIELD(runtime_loader_kernel_symbol), + +COMPPGM1(vgprs, VGPRS), +COMPPGM1(sgprs, SGPRS), +COMPPGM1(priority, PRIORITY), +COMPPGM1(float_mode, FLOAT_MODE), +COMPPGM1(priv, PRIV), +COMPPGM1(dx10_clamp, DX10_CLAMP), +COMPPGM1(debug_mode, DEBUG_MODE), +COMPPGM1(ieee_mode, IEEE_MODE), +COMPPGM2(scratch_en, SCRATCH_EN), +COMPPGM2(user_sgpr, USER_SGPR), +COMPPGM2(tgid_x_en, TGID_X_EN), +COMPPGM2(tgid_y_en, TGID_Y_EN), +COMPPGM2(tgid_z_en, TGID_Z_EN), +COMPPGM2(tg_size_en, TG_SIZE_EN), +COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT), 
+COMPPGM2(excp_en_msb,       EXCP_EN_MSB),
+COMPPGM2(lds_size,          LDS_SIZE),
+COMPPGM2(excp_en,           EXCP_EN),
+
+CODEPROP(enable_sgpr_private_segment_buffer,
+         ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr,
+         ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr,
+         ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr,
+         ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id,
+         ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init,
+         ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size,
+         ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x,
+         ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y,
+         ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z,
+         ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds,
+         ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size,
+         PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64,
+         IS_PTR64),
+CODEPROP(is_dynamic_callstack,
+         IS_DYNAMIC_CALLSTACK),
+CODEPROP(is_debug_enabled,
+         IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled,
+         IS_XNACK_SUPPORTED)
+
+// end of the table
+///////////////////////////////////////////////////////////////////////////////
+
+#undef QNAME
+#undef FLD_T
+#undef FIELD2
+#undef FIELD
+#undef PRINTCODEPROP
+#undef PARSECODEPROP
+#undef CODEPROP
+#undef PRINTCOMP
+#undef PARSECOMP
+#undef COMPPGM
+#undef COMPPGM1
+#undef COMPPGM2
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
new file mode 100644
index 000000000000..f64973afa44f
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -0,0 +1,166 @@
+//===--------------------AMDKernelCodeTUtils.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file - utility functions to parse/print amd_kernel_code_t structure +// +//===----------------------------------------------------------------------===// + +#include "AMDKernelCodeTUtils.h" +#include "SIDefines.h" +#include +#include +#include + +using namespace llvm; + +static ArrayRef get_amd_kernel_code_t_FldNames() { + static StringRef const Table[] = { + "", // not found placeholder +#define RECORD(name, print, parse) #name +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +static StringMap createIndexMap(const ArrayRef &a) { + StringMap map; + for (auto Name : a) + map.insert(std::make_pair(Name, map.size())); + return map; +} + +static int get_amd_kernel_code_t_FieldIndex(StringRef name) { + static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames()); + return map.lookup(name) - 1; // returns -1 if not found +} + +static StringRef get_amd_kernel_code_t_FieldName(int index) { + return get_amd_kernel_code_t_FldNames()[index + 1]; +} + + +// Field printing + +static raw_ostream &printName(raw_ostream &OS, StringRef Name) { + return OS << Name << " = "; +} + +template +static void printField(StringRef Name, const amd_kernel_code_t &C, + raw_ostream &OS) { + printName(OS, Name) << (int)(C.*ptr); +} + +template +static void printBitField(StringRef Name, const amd_kernel_code_t &c, + raw_ostream &OS) { + const auto Mask = (static_cast(1) << width) - 1; + printName(OS, Name) << (int)((c.*ptr >> shift) & Mask); +} + +typedef void(*PrintFx)(StringRef, + const amd_kernel_code_t &, + raw_ostream &); + +static ArrayRef getPrinterTable() { + static const PrintFx Table[] = { +#define RECORD(name, print, parse) print +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C, + int FldIndex, + raw_ostream &OS) { + auto Printer = getPrinterTable()[FldIndex]; + if (Printer) + Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS); +} + +void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C, + raw_ostream &OS, + const char *tab) { + const int Size = getPrinterTable().size(); + for (int i = 0; i < Size; ++i) { + OS << tab; + printAmdKernelCodeField(*C, i, OS); + OS << '\n'; + } +} + + +// Field parsing + +static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) { + + if (MCParser.getLexer().isNot(AsmToken::Equal)) { + Err << "expected '='"; + return false; + } + MCParser.getLexer().Lex(); + + if (MCParser.parseAbsoluteExpression(Value)) { + Err << "integer absolute expression expected"; + return false; + } + return true; +} + +template +static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser, + raw_ostream &Err) { + int64_t Value = 0; + if (!expectAbsExpression(MCParser, Value, Err)) + return false; + C.*ptr = (T)Value; + return true; +} + +template +static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser, + raw_ostream &Err) { + int64_t Value = 0; + if (!expectAbsExpression(MCParser, Value, Err)) + return false; + const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift; + C.*ptr &= (T)~Mask; + C.*ptr |= (T)((Value << shift) & Mask); + return true; +} + +typedef bool(*ParseFx)(amd_kernel_code_t &, + MCAsmParser &MCParser, + raw_ostream &Err); + +static ArrayRef getParserTable() { + static const ParseFx Table[] = { 
+#define RECORD(name, print, parse) parse +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +bool llvm::parseAmdKernelCodeField(StringRef ID, + MCAsmParser &MCParser, + amd_kernel_code_t &C, + raw_ostream &Err) { + const int Idx = get_amd_kernel_code_t_FieldIndex(ID); + if (Idx < 0) { + Err << "unexpected amd_kernel_code_t field name " << ID; + return false; + } + auto Parser = getParserTable()[Idx]; + return Parser ? Parser(C, MCParser, Err) : false; +} diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h new file mode 100644 index 000000000000..d9edca7a82ac --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h @@ -0,0 +1,39 @@ +//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t *- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeTUtils.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODETUTILS_H +#define AMDKERNELCODETUTILS_H + +#include "AMDKernelCodeT.h" + +namespace llvm { + +class MCAsmLexer; +class MCAsmParser; +class raw_ostream; +class StringRef; + +void printAmdKernelCodeField(const amd_kernel_code_t &C, + int FldIndex, + raw_ostream &OS); + +void dumpAmdKernelCode(const amd_kernel_code_t *C, + raw_ostream &OS, + const char *tab); + +bool parseAmdKernelCodeField(StringRef ID, + MCAsmParser &Parser, + amd_kernel_code_t &C, + raw_ostream &Err); + +} + +#endif // AMDKERNELCODETUTILS_H diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt index 2c07aeab7dd3..01b80ebe8d3d 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp + AMDKernelCodeTUtils.cpp + AMDGPUAsmUtils.cpp ) diff --git a/lib/Target/AMDGPU/Utils/Makefile b/lib/Target/AMDGPU/Utils/Makefile deleted file mode 100644 index 1019e726d50e..000000000000 --- a/lib/Target/AMDGPU/Utils/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/Utils/Makefile --------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUUtils - -# Hack: we need to include 'main' AMDGPU target directory to grab private -# headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td index d8738f992630..912ed5329bfe 100644 --- a/lib/Target/AMDGPU/VIInstrFormats.td +++ b/lib/Target/AMDGPU/VIInstrFormats.td @@ -91,21 +91,28 @@ class MTBUFe_vi op> : Enc64 { class SMEMe_vi op, bit imm> : Enc64 { bits<7> sbase; - bits<7> sdata; + bits<7> sdst; bits<1> glc; - bits<20> offset; let Inst{5-0} = sbase{6-1}; - let Inst{12-6} = sdata; + let Inst{12-6} = sdst; let Inst{16} = glc; let Inst{17} = imm; let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding +} + +class SMEM_IMMe_vi op> : SMEMe_vi { + bits<20> offset; let Inst{51-32} = offset; } -class VOP3e_vi op> : Enc64 { - bits<8> vdst; +class SMEM_SOFFe_vi op> : SMEMe_vi { + bits<20> soff; + let Inst{51-32} = soff; +} + +class VOP3a_vi op> : Enc64 { bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -115,7 +122,6 @@ class VOP3e_vi op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -131,6 +137,20 @@ class VOP3e_vi op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP3e_vi op> : VOP3a_vi { + bits<8> vdst; + + let Inst{7-0} = vdst; +} + +// Encoding used for VOPC instructions encoded as VOP3 +// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst +class VOP3ce_vi op> : VOP3a_vi { + bits<8> sdst; + + let Inst{7-0} = sdst; +} + class VOP3be_vi op> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -157,6 +177,117 @@ class VOP3be_vi op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP_DPP pattern, bit HasMods = 0> : + VOPAnyCommon { + let DPP = 1; + let Size = 8; + + let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", ""); +} + +class VOP_DPPe : Enc64 { + bits<2> src0_modifiers; + bits<8> src0; + bits<2> src1_modifiers; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + + let Inst{39-32} = src0; + let Inst{48-40} = dpp_ctrl; + let Inst{51} = bound_ctrl; + let Inst{52} = src0_modifiers{0}; // src0_neg + let Inst{53} = src0_modifiers{1}; // src0_abs + let Inst{54} = src1_modifiers{0}; // src1_neg + let Inst{55} = src1_modifiers{1}; // src1_abs + let Inst{59-56} = bank_mask; + let Inst{63-60} = row_mask; +} + +class VOP1_DPPe op> : VOP_DPPe { + bits<8> vdst; + + let Inst{8-0} = 0xfa; // dpp + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2_DPPe op> : VOP_DPPe { + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = 0xfa; //dpp + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP_SDWA pattern, bit HasMods = 0> : + VOPAnyCommon { + let SDWA = 1; + let Size = 8; +} + +class VOP_SDWAe : Enc64 { + bits<8> src0; + bits<3> src0_sel; + bits<2> src0_fmodifiers; // {abs,neg} + bits<1> src0_imodifiers; // sext + bits<3> src1_sel; + bits<2> src1_fmodifiers; + bits<1> src1_imodifiers; + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + + let Inst{39-32} = src0; + let Inst{42-40} = dst_sel; + let Inst{44-43} = dst_unused; + let Inst{45} = clamp; + let Inst{50-48} = src0_sel; + let Inst{53-52} = src0_fmodifiers; + let Inst{51} = src0_imodifiers; + let Inst{58-56} = src1_sel; + let Inst{61-60} = src1_fmodifiers; + let Inst{59} = src1_imodifiers; +} + +class VOP1_SDWAe op> : VOP_SDWAe { + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} 
= 0x3f; // encoding +} + +class VOP2_SDWAe op> : VOP_SDWAe { + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding +} + +class VOPC_SDWAe op> : VOP_SDWAe { + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = src1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + + // VOPC disallows dst_sel and dst_unused as they have no effect on destination + let Inst{42-40} = 0x6; + let Inst{44-43} = 0x2; +} + class EXPe_vi : EXPe { let Inst{31-26} = 0x31; //encoding } diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index 1a7801c92bd7..5c490ab900f2 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -11,6 +11,8 @@ let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { +let DisableSIDecoder = 1 in { + //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -52,9 +54,9 @@ defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16, defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>; defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>; } // End isCommutable = 1 -defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">; +defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16", VOP_MADMK>; let isCommutable = 1 in { -defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">; +defm V_MADAK_F16 : VOP2MADK , "v_madak_f16", VOP_MADAK>; defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst , "v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>; @@ -73,6 +75,16 @@ defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>; } // End isCommutable = 1 defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>; +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// +let isCommutable = 1 in { + defm V_MAD_F16 : VOP3Inst , "v_mad_f16", VOP_F16_F16_F16_F16>; + defm V_MAD_U16 : VOP3Inst , "v_mad_u16", VOP_I16_I16_I16_I16>; + defm V_MAD_I16 : VOP3Inst , "v_mad_i16", VOP_I16_I16_I16_I16>; +} +} // let DisableSIDecoder = 1 + // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. 
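One detail of the VOP_DPPe/VOP_SDWAe encodings defined above that is easy to miss: 0xfa and 0xf9 are marker constants written into the 9-bit src0 field (Inst{8-0}) to select the DPP and SDWA forms. A decoder-side sketch (the helper is illustrative, not LLVM API):

// The encodings above place a marker in the src0 field (Inst{8-0}):
// 0xfa selects the DPP form, 0xf9 the SDWA form; anything else is an
// ordinary VOP1/VOP2 source operand.
enum class VopForm { Plain, Dpp, Sdwa };

VopForm classifyVopSrc0(unsigned Src0Field) {
  switch (Src0Field) {
  case 0xfa: return VopForm::Dpp;
  case 0xf9: return VopForm::Sdwa;
  default:   return VopForm::Plain;
  }
}

The real source operand then lives in the second dword of the 64-bit encoding, which is why both forms set Size = 8.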
@@ -99,6 +111,9 @@ def S_DCACHE_WB : SMEM_Inval <0x21, def S_DCACHE_WB_VOL : SMEM_Inval <0x23, "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +def S_MEMREALTIME : SMEM_Ret<0x25, + "s_memrealtime", int_amdgcn_s_memrealtime>; + } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI let Predicates = [isVI] in { @@ -109,4 +124,35 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; +//===----------------------------------------------------------------------===// +// DPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl), + (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), + (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) +>; + +//===----------------------------------------------------------------------===// +// Misc Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (readcyclecounter)), + (S_MEMREALTIME) +>; + +//===----------------------------------------------------------------------===// +// DS_PERMUTE/DS_BPERMUTE Instructions. +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { +defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32, + int_amdgcn_ds_permute>; +defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32, + int_amdgcn_ds_bpermute>; +} + } // End Predicates = [isVI] diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index 7a1865ce5fd6..9228cc2d7a9c 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -68,34 +68,31 @@ namespace { // unsigned createDupLane(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Reg, unsigned Lane, - bool QPR=false); + const DebugLoc &DL, unsigned Reg, unsigned Lane, + bool QPR = false); unsigned createExtractSubreg(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned DReg, unsigned Lane, - const TargetRegisterClass *TRC); + const DebugLoc &DL, unsigned DReg, + unsigned Lane, const TargetRegisterClass *TRC); unsigned createVExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Ssub0, unsigned Ssub1); + const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1); unsigned createRegSequence(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Reg1, unsigned Reg2); + const DebugLoc &DL, unsigned Reg1, + unsigned Reg2); unsigned createInsertSubreg(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, unsigned DReg, unsigned Lane, - unsigned ToInsert); + const DebugLoc &DL, unsigned DReg, + unsigned Lane, unsigned ToInsert); unsigned createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL); + const DebugLoc &DL); // // Various property checkers @@ -426,11 +423,10 @@ SmallVector A15SDOptimizer::getReadDPRs(MachineInstr *MI) { } // Creates a DPR register from an SPR one by using a VDUP. 
-unsigned -A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Reg, unsigned Lane, bool QPR) { +unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, unsigned Reg, + unsigned Lane, bool QPR) { unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass); AddDefaultPred(BuildMI(MBB, @@ -445,12 +441,10 @@ A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, } // Creates a SPR register from a DPR by copying the value in lane 0. -unsigned -A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned DReg, unsigned Lane, - const TargetRegisterClass *TRC) { +unsigned A15SDOptimizer::createExtractSubreg( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, unsigned DReg, unsigned Lane, + const TargetRegisterClass *TRC) { unsigned Out = MRI->createVirtualRegister(TRC); BuildMI(MBB, InsertBefore, @@ -462,11 +456,9 @@ A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB, } // Takes two SPR registers and creates a DPR by using a REG_SEQUENCE. -unsigned -A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Reg1, unsigned Reg2) { +unsigned A15SDOptimizer::createRegSequence( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); BuildMI(MBB, InsertBefore, @@ -481,11 +473,10 @@ A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB, // Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1) // and merges them into one DPR register. -unsigned -A15SDOptimizer::createVExt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, - unsigned Ssub0, unsigned Ssub1) { +unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, unsigned Ssub0, + unsigned Ssub1) { unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); AddDefaultPred(BuildMI(MBB, InsertBefore, @@ -497,11 +488,9 @@ A15SDOptimizer::createVExt(MachineBasicBlock &MBB, return Out; } -unsigned -A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, - DebugLoc DL, unsigned DReg, unsigned Lane, - unsigned ToInsert) { +unsigned A15SDOptimizer::createInsertSubreg( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); BuildMI(MBB, InsertBefore, @@ -517,7 +506,7 @@ A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB, unsigned A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - DebugLoc DL) { + const DebugLoc &DL) { unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, @@ -681,6 +670,9 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { } bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(*Fn.getFunction())) + return false; + const ARMSubtarget &STI = Fn.getSubtarget(); // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be // enabled when NEON is available. 
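The recurring change in this file, DebugLoc parameters becoming const DebugLoc &, is worth a note: at this point in LLVM's history DebugLoc wraps a tracking metadata reference, so a by-value copy is not free. A minimal stand-in sketch (FakeDebugLoc is illustrative, not the real class; the TrackingMDNodeRef detail is an assumption from the headers of this era):

// FakeDebugLoc stands in for llvm::DebugLoc, which holds a tracking
// metadata reference; copying it has real cost, so passing by const
// reference avoids a copy at every call site.
struct FakeDebugLoc {
  void *TrackedNode = nullptr; // real class: a TrackingMDNodeRef
};

void emitAt(const FakeDebugLoc &DL);  // new style: no copy per call
void emitAtByValue(FakeDebugLoc DL);  // old style: copies the tracker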
@@ -701,7 +693,7 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end(); MI != ME;) { - Modified |= runOnInstruction(MI++); + Modified |= runOnInstruction(&*MI++); } } diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index cd7540e52410..690ff86a0c86 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -27,6 +27,7 @@ class FunctionPass; class ImmutablePass; class MachineInstr; class MCInst; +class PassRegistry; class TargetLowering; class TargetMachine; @@ -45,6 +46,9 @@ FunctionPass *createThumb2SizeReductionPass( void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); +void initializeARMLoadStoreOptPass(PassRegistry &); +void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); + } // end namespace llvm; #endif diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index c171656b48ab..ef626b66a1e7 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -76,6 +76,11 @@ def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", "Enable Thumb2 extract and pack instructions">; def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", "Has data barrier (dmb / dsb) instructions">; +def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", + "Has v7 clrex instruction">; +def FeatureAcquireRelease : SubtargetFeature<"acquire-release", + "HasAcquireRelease", "true", + "Has v8 acquire/release (lda/ldaex etc) instructions">; def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", "FP compare + branch is slow">; def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", @@ -84,17 +89,98 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable support for Performance Monitor extensions">; def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", "Enable support for TrustZone security extensions">; +def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", + "Enable support for ARMv8-M Security Extensions">; def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable support for Cryptography extensions", [FeatureNEON]>; def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable support for CRC instructions">; +// Not to be confused with FeatureHasRetAddrStack (return address stack) +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable Reliability, Availability and Serviceability extensions">; + // Cyclone has preferred instructions for zeroing VFP registers, which can // execute in 0 cycles. def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +// Whether or not it may be profitable to unpredicate certain instructions +// during if conversion. +def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", + "IsProfitableToUnpredicate", + "true", + "Is profitable to unpredicate">; + +// Some targets (e.g. Swift) have microcoded VGETLNi32. +def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", + "HasSlowVGETLNi32", "true", + "Has slow VGETLNi32 - prefer VMOV">; + +// Some targets (e.g. Swift) have microcoded VDUP32. +def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", + "Has slow VDUP32 - prefer VMOV">; + +// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON +// for scalar FP, as this allows more effective execution domain optimization. 
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", + "true", "Prefer VMOVSR">; + +// Swift has ISHST barriers compatible with Atomic Release semantics but weaker +// than ISH +def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", + "true", "Prefer ISHST barriers">; + +// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. +def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", + "Has muxed AGU and NEON/FPU">; + +// On some targets, a VLDM/VSTM starting with an odd register number needs more +// microops than single VLDRS. +def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", + "true", "VLDM/VSTM starting with an odd register is slow">; + +// Some targets have a renaming dependency when loading into D subregisters. +def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", + "SlowLoadDSubregister", "true", + "Loading into D subregs is slow">; +// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. +def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", + "DontWidenVMOVS", "true", + "Don't widen VMOVS to VMOVD">; + +// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. +def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", + "Expand VFP/NEON MLA/MLS instructions">; + +// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. +def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", + "true", "Has VMLx hazards">; + +// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from +// VFP to NEON, as an execution domain optimization. +def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", + "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">; + +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. This affects instruction selection and should +// only be enabled if the handling of denormals is not important. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + +// On some processors, VLDn instructions that access unaligned data take one +// extra cycle. Take that into account when computing operand latencies. +def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", + "true", + "Check for VLDn unaligned access">; + +// Some processors have a nonpipelined VFP coprocessor. +def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", + "NonpipelinedVFP", "true", + "VFP instructions are not pipelined">; + // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. @@ -106,12 +192,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", "HasVMLxForwarding", "true", "Has multiplier accumulator forwarding">; -// Some processors benefit from using NEON instructions for scalar -// single-precision FP operations. -def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", - "true", - "Use NEON for single precision FP">; - // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; @@ -130,7 +210,7 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", // Some processors perform return stack prediction. 
CodeGen should avoid issue // "normal" call instructions to callees which do not return. -def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true", +def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", "Has return address stack">; /// DSP extension. @@ -200,24 +280,31 @@ def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; +def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true", + "Support ARM v8M Baseline instructions", + [HasV6MOps]>; def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", "Support ARM v6k instructions", [HasV6Ops]>; def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", - [HasV6MOps, HasV6KOps, FeatureThumb2]>; + [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>; def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", - [HasV6T2Ops, FeaturePerfMon]>; + [HasV6T2Ops, FeaturePerfMon, + FeatureV7Clrex]>; def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", - [HasV7Ops]>; + [HasV7Ops, FeatureAcquireRelease]>; def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps]>; +def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", + "Support ARM v8M Mainline instructions", + [HasV7Ops]>; //===----------------------------------------------------------------------===// @@ -238,6 +325,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", "Cortex-A17 ARM processors", []>; +def ProcA32 : SubtargetFeature<"a32", "ARMProcFamily", "CortexA32", + "Cortex-A32 ARM processors", []>; def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", @@ -246,6 +335,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", []>; def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", []>; +def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", + "Cortex-A73 ARM processors", []>; def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", "Qualcomm ARM processors", []>; @@ -256,12 +347,14 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", "Samsung Exynos-M1 processors", []>; def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", - "Cortex-R4 ARM processors", []>; + "Cortex-R4 ARM processors", []>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", "Cortex-R5 ARM processors", []>; def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", "Cortex-R7 ARM processors", []>; +def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", + "Cortex-M3 ARM processors", []>; //===----------------------------------------------------------------------===// // ARM schedules. 
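[Editorial aside — not part of the upstream patch. Each SubtargetFeature definition above names, in its second template argument, a boolean field on ARMSubtarget; feature parsing sets that field from the "+feature" string, and target hooks read it back (e.g. Subtarget.isProfitableToUnpredicate(), used later in this patch). A minimal, hypothetical C++ sketch of that pattern, with simplified names — the real field/accessor pair lives in ARMSubtarget.h and the real parser is the TableGen-generated ParseSubtargetFeatures():]

#include <string>

class ARMSubtargetSketch {
  // Field named by SubtargetFeature<"prof-unpr", "IsProfitableToUnpredicate",
  // "true", ...> above; defaults to off.
  bool IsProfitableToUnpredicate = false;

public:
  // Stand-in for the generated feature parser: flip the field when the
  // corresponding feature bit is requested.
  void parseFeature(const std::string &F) {
    if (F == "+prof-unpr")
      IsProfitableToUnpredicate = true;
  }

  // Accessor consumed by hooks such as
  // ARMBaseInstrInfo::isProfitableToUnpredicate() later in this patch.
  bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; }
};

[End of editorial aside; the patch resumes below.]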
@@ -374,7 +467,27 @@ def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, FeatureMP, FeatureVirtualization, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeatureRAS]>; + +def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline", + [HasV8MBaselineOps, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureV7Clrex, + Feature8MSecExt, + FeatureAcquireRelease, + FeatureMClass]>; + +def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline", + [HasV8MMainlineOps, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + Feature8MSecExt, + FeatureAcquireRelease, + FeatureMClass]>; // Aliases def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; @@ -452,7 +565,7 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, // FIXME: A5 has currently the same Schedule model as A8 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureTrustZone, FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, @@ -462,9 +575,10 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureVFP4]>; def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureTrustZone, FeatureSlowFPBrcc, + FeatureHasVMLxHazards, FeatureHasSlowFPVMLx, FeatureVMLxForwarding, FeatureT2XtPk, @@ -475,25 +589,33 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, FeatureVirtualization]>; def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, - FeatureHasRAS, + FeatureHasRetAddrStack, + FeatureNonpipelinedVFP, FeatureTrustZone, FeatureSlowFPBrcc, + FeatureHasVMLxHazards, FeatureHasSlowFPVMLx, FeatureVMLxForwarding, FeatureT2XtPk]>; def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureTrustZone, + FeatureHasVMLxHazards, FeatureVMLxForwarding, FeatureT2XtPk, FeatureFP16, FeatureAvoidPartialCPSR, + FeatureExpandMLx, + FeaturePreferVMOVSR, + FeatureMuxedUnits, + FeatureNEONForFPMovs, + FeatureCheckVLDnAlign, FeatureMP]>; // FIXME: A12 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureTrustZone, FeatureVMLxForwarding, FeatureT2XtPk, @@ -506,11 +628,14 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, // FIXME: A15 has currently the same Schedule model as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, - FeatureHasRAS, + FeatureDontWidenVMOVS, + FeatureHasRetAddrStack, + FeatureMuxedUnits, FeatureTrustZone, FeatureT2XtPk, FeatureVFP4, FeatureMP, + FeatureCheckVLDnAlign, FeatureHWDiv, FeatureHWDivARM, FeatureAvoidPartialCPSR, @@ -518,7 +643,7 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, // FIXME: A17 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureTrustZone, FeatureMP, FeatureVMLxForwarding, @@ -533,7 +658,9 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, // FIXME: krait has currently the same features as A9 plus VFP4 and hardware // division features. 
def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, - FeatureHasRAS, + FeatureHasRetAddrStack, + FeatureMuxedUnits, + FeatureCheckVLDnAlign, FeatureVMLxForwarding, FeatureT2XtPk, FeatureFP16, @@ -543,7 +670,7 @@ def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureHWDivARM]>; def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureNEONForFP, FeatureT2XtPk, FeatureVFP4, @@ -552,17 +679,24 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureHWDivARM, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, - FeatureHasSlowFPVMLx]>; + FeatureHasSlowFPVMLx, + FeatureHasVMLxHazards, + FeatureProfUnpredicate, + FeaturePrefISHSTBarrier, + FeatureSlowOddRegister, + FeatureSlowLoadDSubreg, + FeatureSlowVGETLNi32, + FeatureSlowVDUP32]>; // FIXME: R4 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureAvoidPartialCPSR, FeatureT2XtPk]>; // FIXME: R4F has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, FeatureVFP3, @@ -572,7 +706,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, // FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureVFP3, FeatureD16, FeatureSlowFPBrcc, @@ -583,9 +717,20 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, // FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, - FeatureHasRAS, + FeatureHasRetAddrStack, + FeatureVFP3, + FeatureD16, + FeatureFP16, + FeatureMP, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, + FeatureHasRetAddrStack, FeatureVFP3, - FeatureVFPOnlySP, FeatureD16, FeatureFP16, FeatureMP, @@ -595,8 +740,8 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureAvoidPartialCPSR, FeatureT2XtPk]>; -def : ProcNoItin<"cortex-m3", [ARMv7m]>; -def : ProcNoItin<"sc300", [ARMv7m]>; +def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>; +def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>; def : ProcNoItin<"cortex-m4", [ARMv7em, FeatureVFP4, @@ -607,6 +752,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureFPARMv8, FeatureD16]>; +def : ProcNoItin<"cortex-a32", [ARMv8a, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, FeatureHWDiv, @@ -636,9 +787,16 @@ def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureCrypto, FeatureCRC]>; +def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + // Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, - FeatureHasRAS, + FeatureHasRetAddrStack, FeatureNEONForFP, FeatureT2XtPk, FeatureVFP4, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 206db9619a2f..04863a7ecf8f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -43,12 +43,11 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ARMBuildAttributes.h" 
-#include "llvm/Support/TargetParser.h" #include "llvm/Support/COFF.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -213,8 +212,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, GetARMGVSymbol(GV, TF)->print(O, MAI); printOffset(MO.getOffset(), O); - if (TF == ARMII::MO_PLT) - O << "(PLT)"; break; } case MachineOperand::MO_ConstantPoolIndex: @@ -516,9 +513,10 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer->AddBlankLine(); } - Stubs = MMIMacho.GetHiddenGVStubList(); + Stubs = MMIMacho.GetThreadLocalGVStubList(); if (!Stubs.empty()) { - OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); EmitAlignment(2); for (auto &Stub : Stubs) @@ -536,18 +534,48 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + if (TT.isOSBinFormatCOFF()) { + const auto &TLOF = + static_cast(getObjFileLowering()); + + std::string Flags; + raw_string_ostream OS(Flags); + + for (const auto &Function : M) + TLOF.emitLinkerFlagsForGlobal(OS, &Function, *Mang); + for (const auto &Global : M.globals()) + TLOF.emitLinkerFlagsForGlobal(OS, &Global, *Mang); + for (const auto &Alias : M.aliases()) + TLOF.emitLinkerFlagsForGlobal(OS, &Alias, *Mang); + + OS.flush(); + + // Output collected flags + if (!Flags.empty()) { + OutStreamer->SwitchSection(TLOF.getDrectveSection()); + OutStreamer->EmitBytes(Flags); + } + } + // The last attribute to be emitted is ABI_optimization_goals MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast(TS); if (OptimizationGoals > 0 && - (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI())) + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetMuslAEABI())) ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); OptimizationGoals = -1; ATS.finishAttributeSection(); } +static bool isV8M(const ARMSubtarget *Subtarget) { + // Note that v8M Baseline is a subset of v6T2! 
+ return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) || + Subtarget->hasV8MMainlineOps(); +} + //===----------------------------------------------------------------------===// // Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() // FIXME: @@ -561,13 +589,17 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, return ARMBuildAttrs::v5TEJ; if (Subtarget->hasV8Ops()) - return ARMBuildAttrs::v8; + return ARMBuildAttrs::v8_A; + else if (Subtarget->hasV8MMainlineOps()) + return ARMBuildAttrs::v8_M_Main; else if (Subtarget->hasV7Ops()) { if (Subtarget->isMClass() && Subtarget->hasDSP()) return ARMBuildAttrs::v7E_M; return ARMBuildAttrs::v7; } else if (Subtarget->hasV6T2Ops()) return ARMBuildAttrs::v6T2; + else if (Subtarget->hasV8MBaselineOps()) + return ARMBuildAttrs::v8_M_Base; else if (Subtarget->hasV6MOps()) return ARMBuildAttrs::v6S_M; else if (Subtarget->hasV6Ops()) @@ -609,9 +641,9 @@ void ARMAsmPrinter::emitAttributes() { static_cast(TM); const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian()); - std::string CPUString = STI.getCPUString(); + const std::string &CPUString = STI.getCPUString(); - if (CPUString.find("generic") != 0) { //CPUString doesn't start with "generic" + if (!StringRef(CPUString).startswith("generic")) { // FIXME: remove krait check when GNU tools support krait cpu if (STI.isKrait()) { ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); @@ -627,7 +659,7 @@ void ARMAsmPrinter::emitAttributes() { // Tag_CPU_arch_profile must have the default value of 0 when "Architecture // profile is not applicable (e.g. pre v7, or cross-profile code)". - if (STI.hasV7Ops()) { + if (STI.hasV7Ops() || isV8M(&STI)) { if (STI.isAClass()) { ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, ARMBuildAttrs::ApplicationProfile); @@ -643,7 +675,10 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, STI.hasARMOps() ? ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed); - if (STI.isThumb1Only()) { + if (isV8M(&STI)) { + ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumbDerived); + } else if (STI.isThumb1Only()) { ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed); } else if (STI.hasThumb2()) { ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, @@ -690,7 +725,7 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitFPU(ARM::FK_VFPV2); } - if (TM.getRelocationModel() == Reloc::PIC_) { + if (isPositionIndependent()) { // PIC specific attributes. 
ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data, ARMBuildAttrs::AddressRWPCRel); @@ -794,6 +829,9 @@ void ARMAsmPrinter::emitAttributes() { if (STI.hasDivideInARMMode() && !STI.hasV8Ops()) ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt); + if (STI.hasDSP() && isV8M(&STI)) + ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed); + if (MMI) { if (const Module *SourceModule = MMI->getModule()) { // ABI_PCS_wchar_t to indicate wchar_t width @@ -853,11 +891,18 @@ static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber, static MCSymbolRefExpr::VariantKind getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { switch (Modifier) { - case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None; - case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; - case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF; - case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF; - case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL; + case ARMCP::no_modifier: + return MCSymbolRefExpr::VK_None; + case ARMCP::TLSGD: + return MCSymbolRefExpr::VK_TLSGD; + case ARMCP::TPOFF: + return MCSymbolRefExpr::VK_TPOFF; + case ARMCP::GOTTPOFF: + return MCSymbolRefExpr::VK_GOTTPOFF; + case ARMCP::GOT_PREL: + return MCSymbolRefExpr::VK_ARM_GOT_PREL; + case ARMCP::SECREL: + return MCSymbolRefExpr::VK_SECREL; } llvm_unreachable("Invalid ARMCPModifier!"); } @@ -865,8 +910,8 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags) { if (Subtarget->isTargetMachO()) { - bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) && - Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + bool IsIndirect = + (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV); if (!IsIndirect) return getSymbol(GV); @@ -876,8 +921,9 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, MachineModuleInfoMachO &MMIMachO = MMI->getObjFileInfo(); MachineModuleInfoImpl::StubValueTy &StubSym = - GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) - : MMIMachO.getGVStubEntry(MCSym); + GV->isThreadLocal() ? MMIMachO.getThreadLocalGVStubEntry(MCSym) + : MMIMachO.getGVStubEntry(MCSym); + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); @@ -991,7 +1037,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { // .word (LBB1 - LJTI_0_0) const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - if (TM.getRelocationModel() == Reloc::PIC_) + if (isPositionIndependent()) Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol, OutContext), OutContext); @@ -1227,6 +1273,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { const DataLayout &DL = getDataLayout(); + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast(TS); // If we just ended a constant pool, mark it as such. if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { @@ -1643,29 +1691,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Non-Darwin binutils don't yet support the "trap" mnemonic. // FIXME: Remove this special case when they do. 
     if (!Subtarget->isTargetMachO()) {
-      //.long 0xe7ffdefe @ trap
       uint32_t Val = 0xe7ffdefeUL;
       OutStreamer->AddComment("trap");
-      OutStreamer->EmitIntValue(Val, 4);
+      ATS.emitInst(Val);
       return;
     }
     break;
   }
   case ARM::TRAPNaCl: {
-    //.long 0xe7fedef0 @ trap
     uint32_t Val = 0xe7fedef0UL;
     OutStreamer->AddComment("trap");
-    OutStreamer->EmitIntValue(Val, 4);
+    ATS.emitInst(Val);
     return;
   }
   case ARM::tTRAP: {
     // Non-Darwin binutils don't yet support the "trap" mnemonic.
     // FIXME: Remove this special case when they do.
     if (!Subtarget->isTargetMachO()) {
-      //.short 57086 @ trap
       uint16_t Val = 0xdefe;
       OutStreamer->AddComment("trap");
-      OutStreamer->EmitIntValue(Val, 2);
+      ATS.emitInst(Val, 'n');
       return;
     }
     break;
@@ -1845,6 +1890,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // bx $scratch
     unsigned SrcReg = MI->getOperand(0).getReg();
     unsigned ScratchReg = MI->getOperand(1).getReg();
+
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
       .addReg(ScratchReg)
       .addReg(SrcReg)
@@ -1885,6 +1931,36 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(0));
     return;
   }
+  case ARM::tInt_WIN_eh_sjlj_longjmp: {
+    // ldr.w r11, [$src, #0]
+    // ldr.w sp, [$src, #8]
+    // ldr.w pc, [$src, #4]
+
+    unsigned SrcReg = MI->getOperand(0).getReg();
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::R11)
+                                     .addReg(SrcReg)
+                                     .addImm(0)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::SP)
+                                     .addReg(SrcReg)
+                                     .addImm(8)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::PC)
+                                     .addReg(SrcReg)
+                                     .addImm(4)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    return;
+  }
   }
 
   MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index ed7be2de51ca..97f5ca0ecbc2 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -95,6 +95,7 @@ public:
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
 
 private:
+  // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
   void emitAttributes();
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 49f328852667..693f16499717 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -51,15 +51,6 @@ static cl::opt<bool>
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                cl::desc("Enable ARM 2-addr to 3-addr conv"));
 
-static cl::opt<bool>
-WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
-           cl::desc("Widen ARM vmovs to vmovd when possible"));
-
-static cl::opt<unsigned>
-SwiftPartialUpdateClearance("swift-partial-update-clearance",
-                            cl::Hidden, cl::init(12),
-                            cl::desc("Clearance before partial register updates"));
-
 /// ARM_MLxEntry - Record information about MLA / MLS instructions.
 struct ARM_MLxEntry {
   uint16_t MLxOpc;     // MLA / MLS opcode
@@ -124,18 +115,15 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
   return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
 }
 
-MachineInstr *
-ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
-                                        MachineBasicBlock::iterator &MBBI,
-                                        LiveVariables *LV) const {
+MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
+    MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
   // FIXME: Thumb2 support.
if (!EnableARM3Addr) return nullptr; - MachineInstr *MI = MBBI; - MachineFunction &MF = *MI->getParent()->getParent(); - uint64_t TSFlags = MI->getDesc().TSFlags; + MachineFunction &MF = *MI.getParent()->getParent(); + uint64_t TSFlags = MI.getDesc().TSFlags; bool isPre = false; switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { default: return nullptr; @@ -148,24 +136,24 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // Try splitting an indexed load/store to an un-indexed one plus an add/sub // operation. - unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); + unsigned MemOpc = getUnindexedOpcode(MI.getOpcode()); if (MemOpc == 0) return nullptr; MachineInstr *UpdateMI = nullptr; MachineInstr *MemMI = nullptr; unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); unsigned NumOps = MCID.getNumOperands(); - bool isLoad = !MI->mayStore(); - const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0); - const MachineOperand &Base = MI->getOperand(2); - const MachineOperand &Offset = MI->getOperand(NumOps-3); + bool isLoad = !MI.mayStore(); + const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0); + const MachineOperand &Base = MI.getOperand(2); + const MachineOperand &Offset = MI.getOperand(NumOps - 3); unsigned WBReg = WB.getReg(); unsigned BaseReg = Base.getReg(); unsigned OffReg = Offset.getReg(); - unsigned OffImm = MI->getOperand(NumOps-2).getImm(); - ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); + unsigned OffImm = MI.getOperand(NumOps - 2).getImm(); + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm(); switch (AddrMode) { default: llvm_unreachable("Unknown indexed op!"); case ARMII::AddrMode2: { @@ -176,22 +164,33 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // Can't encode it in a so_imm operand. This transformation will // add more than 1 instruction. Abandon! return nullptr; - UpdateMI = BuildMI(MF, MI->getDebugLoc(), + UpdateMI = BuildMI(MF, MI.getDebugLoc(), get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) - .addReg(BaseReg).addImm(Amt) - .addImm(Pred).addReg(0).addReg(0); + .addReg(BaseReg) + .addImm(Amt) + .addImm(Pred) + .addReg(0) + .addReg(0); } else if (Amt != 0) { ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); - UpdateMI = BuildMI(MF, MI->getDebugLoc(), + UpdateMI = BuildMI(MF, MI.getDebugLoc(), get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg) - .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) - .addImm(Pred).addReg(0).addReg(0); + .addReg(BaseReg) + .addReg(OffReg) + .addReg(0) + .addImm(SOOpc) + .addImm(Pred) + .addReg(0) + .addReg(0); } else - UpdateMI = BuildMI(MF, MI->getDebugLoc(), + UpdateMI = BuildMI(MF, MI.getDebugLoc(), get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) - .addReg(BaseReg).addReg(OffReg) - .addImm(Pred).addReg(0).addReg(0); + .addReg(BaseReg) + .addReg(OffReg) + .addImm(Pred) + .addReg(0) + .addReg(0); break; } case ARMII::AddrMode3 : { @@ -199,15 +198,21 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Amt = ARM_AM::getAM3Offset(OffImm); if (OffReg == 0) // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. - UpdateMI = BuildMI(MF, MI->getDebugLoc(), + UpdateMI = BuildMI(MF, MI.getDebugLoc(), get(isSub ? 
ARM::SUBri : ARM::ADDri), WBReg) - .addReg(BaseReg).addImm(Amt) - .addImm(Pred).addReg(0).addReg(0); + .addReg(BaseReg) + .addImm(Amt) + .addImm(Pred) + .addReg(0) + .addReg(0); else - UpdateMI = BuildMI(MF, MI->getDebugLoc(), + UpdateMI = BuildMI(MF, MI.getDebugLoc(), get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) - .addReg(BaseReg).addReg(OffReg) - .addImm(Pred).addReg(0).addReg(0); + .addReg(BaseReg) + .addReg(OffReg) + .addImm(Pred) + .addReg(0) + .addReg(0); break; } } @@ -215,24 +220,34 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, std::vector NewMIs; if (isPre) { if (isLoad) - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc), MI->getOperand(0).getReg()) - .addReg(WBReg).addImm(0).addImm(Pred); + MemMI = + BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg()) + .addReg(WBReg) + .addImm(0) + .addImm(Pred); else - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc)).addReg(MI->getOperand(1).getReg()) - .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc)) + .addReg(MI.getOperand(1).getReg()) + .addReg(WBReg) + .addReg(0) + .addImm(0) + .addImm(Pred); NewMIs.push_back(MemMI); NewMIs.push_back(UpdateMI); } else { if (isLoad) - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc), MI->getOperand(0).getReg()) - .addReg(BaseReg).addImm(0).addImm(Pred); + MemMI = + BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg()) + .addReg(BaseReg) + .addImm(0) + .addImm(Pred); else - MemMI = BuildMI(MF, MI->getDebugLoc(), - get(MemOpc)).addReg(MI->getOperand(1).getReg()) - .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc)) + .addReg(MI.getOperand(1).getReg()) + .addReg(BaseReg) + .addReg(0) + .addImm(0) + .addImm(Pred); if (WB.isDead()) UpdateMI->getOperand(0).setIsDead(); NewMIs.push_back(UpdateMI); @@ -241,8 +256,8 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // Transfer LiveVariables states, kill / dead info. if (LV) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { unsigned Reg = MO.getReg(); @@ -250,7 +265,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (MO.isDef()) { MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI; if (MO.isDead()) - LV->addVirtualRegisterDead(Reg, NewMI); + LV->addVirtualRegisterDead(Reg, *NewMI); } if (MO.isUse() && MO.isKill()) { for (unsigned j = 0; j < 2; ++j) { @@ -258,7 +273,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *NewMI = NewMIs[j]; if (!NewMI->readsRegister(Reg)) continue; - LV->addVirtualRegisterKilled(Reg, NewMI); + LV->addVirtualRegisterKilled(Reg, *NewMI); if (VI.removeKill(MI)) VI.Kills.push_back(NewMI); break; @@ -268,17 +283,18 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } } + MachineBasicBlock::iterator MBBI = MI.getIterator(); MFI->insert(MBBI, NewMIs[1]); MFI->insert(MBBI, NewMIs[0]); return NewMIs[0]; } // Branch analysis. 
-bool
-ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
+bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
   TBB = nullptr;
   FBB = nullptr;
 
@@ -289,7 +305,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
 
   // Walk backwards from the end of the basic block until the branch is
   // analyzed or we give up.
-  while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
+  while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
 
     // Flag to be raised on unanalyzeable instructions. This is useful in cases
     // where we want to clean up on the end of the basic block before we bail
@@ -322,7 +338,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
       Cond.push_back(I->getOperand(2));
     } else if (I->isReturn()) {
       // Returns can't be analyzed, but we should run cleanup.
-      CantAnalyze = !isPredicated(I);
+      CantAnalyze = !isPredicated(*I);
     } else {
       // We encountered other unrecognized terminator. Bail out immediately.
       return true;
@@ -330,7 +346,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
 
     // Cleanup code - to be run for unpredicated unconditional branches and
     //                returns.
-    if (!isPredicated(I) &&
+    if (!isPredicated(*I) &&
         (isUncondBranchOpcode(I->getOpcode()) ||
          isIndirectBranchOpcode(I->getOpcode()) ||
          isJumpTableBranchOpcode(I->getOpcode()) ||
@@ -344,9 +360,9 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
       if (AllowModify) {
         MachineBasicBlock::iterator DI = std::next(I);
         while (DI != MBB.end()) {
-          MachineInstr *InstToDelete = DI;
+          MachineInstr &InstToDelete = *DI;
           ++DI;
-          InstToDelete->eraseFromParent();
+          InstToDelete.eraseFromParent();
         }
       }
     }
@@ -390,11 +406,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   return 2;
 }
 
-unsigned
-ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                               MachineBasicBlock *FBB,
-                               ArrayRef<MachineOperand> Cond,
-                               DebugLoc DL) const {
+unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL) const {
   ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
   int BOpc = !AFI->isThumbFunction()
     ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
@@ -438,10 +454,10 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   return false;
 }
 
-bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
-  if (MI->isBundle()) {
-    MachineBasicBlock::const_instr_iterator I = MI->getIterator();
-    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
+  if (MI.isBundle()) {
+    MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+    MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
     while (++I != E && I->isInsideBundle()) {
       int PIdx = I->findFirstPredOperandIdx();
       if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
@@ -450,26 +466,26 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
     return false;
   }
 
-  int PIdx = MI->findFirstPredOperandIdx();
-  return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+  int PIdx = MI.findFirstPredOperandIdx();
+  return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
 }
 
-bool ARMBaseInstrInfo::
-PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
-  unsigned Opc = MI->getOpcode();
+bool ARMBaseInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+  unsigned Opc = MI.getOpcode();
   if (isUncondBranchOpcode(Opc)) {
-    MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
-    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+    MI.setDesc(get(getMatchingCondBranchOpcode(Opc)));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
       .addImm(Pred[0].getImm())
       .addReg(Pred[1].getReg());
     return true;
   }
 
-  int PIdx = MI->findFirstPredOperandIdx();
+  int PIdx = MI.findFirstPredOperandIdx();
   if (PIdx != -1) {
-    MachineOperand &PMO = MI->getOperand(PIdx);
+    MachineOperand &PMO = MI.getOperand(PIdx);
     PMO.setImm(Pred[0].getImm());
-    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+    MI.getOperand(PIdx+1).setReg(Pred[1].getReg());
     return true;
   }
   return false;
@@ -501,11 +517,11 @@ bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
   }
 }
 
-bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
-                                        std::vector<MachineOperand> &Pred) const {
+bool ARMBaseInstrInfo::DefinesPredicate(
+    MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
   bool Found = false;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
     if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
         (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
       Pred.push_back(MO);
@@ -555,21 +571,21 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
 
 /// isPredicable - Return true if the specified instruction can be predicated.
 /// By default, this returns true for every instruction with a
 /// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { - if (!MI->isPredicable()) +bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const { + if (!MI.isPredicable()) return false; - if (!isEligibleForITBlock(MI)) + if (!isEligibleForITBlock(&MI)) return false; ARMFunctionInfo *AFI = - MI->getParent()->getParent()->getInfo(); + MI.getParent()->getParent()->getInfo(); if (AFI->isThumb2Function()) { if (getSubtarget().restrictIT()) - return isV8EligibleForIT(MI); + return isV8EligibleForIT(&MI); } else { // non-Thumb - if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) + if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) return false; } @@ -594,19 +610,19 @@ template <> bool IsCPSRDead(MachineInstr *MI) { /// GetInstSize - Return the size of the specified MachineInstr. /// -unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); if (MCID.getSize()) return MCID.getSize(); // If this machine instr is an inline asm, measure it. - if (MI->getOpcode() == ARM::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); - unsigned Opc = MI->getOpcode(); + if (MI.getOpcode() == ARM::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + unsigned Opc = MI.getOpcode(); switch (Opc) { default: // pseudo-instruction sizes are zero. @@ -628,11 +644,13 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case ARM::JUMPTABLE_TBH: // If this machine instr is a constant pool entry, its size is recorded as // operand #2. 
- return MI->getOperand(2).getImm(); + return MI.getOperand(2).getImm(); case ARM::Int_eh_sjlj_longjmp: return 16; case ARM::tInt_eh_sjlj_longjmp: return 10; + case ARM::tInt_WIN_eh_sjlj_longjmp: + return 12; case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: return 20; @@ -641,17 +659,17 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case ARM::t2Int_eh_sjlj_setjmp_nofp: return 12; case ARM::SPACE: - return MI->getOperand(1).getImm(); + return MI.getOperand(1).getImm(); } } -unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { +unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const { unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI->getIterator(); - MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator I = MI.getIterator(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { assert(!I->isBundle() && "No nested bundle!"); - Size += GetInstSizeInBytes(&*I); + Size += GetInstSizeInBytes(*I); } return Size; } @@ -700,9 +718,9 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB, } void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { bool GPRDest = ARM::GPRRegClass.contains(DestReg); bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); @@ -976,20 +994,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } } -unsigned -ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { +unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { default: break; case ARM::STRrs: case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. 
- if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() && + MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 && + MI.getOperand(3).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; case ARM::STRi12: @@ -997,27 +1012,24 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, case ARM::tSTRspi: case ARM::VSTRD: case ARM::VSTRS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; case ARM::VST1q64: case ARM::VST1d64TPseudo: case ARM::VST1d64QPseudo: - if (MI->getOperand(0).isFI() && - MI->getOperand(2).getSubReg() == 0) { - FrameIndex = MI->getOperand(0).getIndex(); - return MI->getOperand(2).getReg(); + if (MI.getOperand(0).isFI() && MI.getOperand(2).getSubReg() == 0) { + FrameIndex = MI.getOperand(0).getIndex(); + return MI.getOperand(2).getReg(); } break; case ARM::VSTMQIA: - if (MI->getOperand(1).isFI() && - MI->getOperand(0).getSubReg() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1025,10 +1037,10 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, +unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { const MachineMemOperand *Dummy; - return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); + return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); } void ARMBaseInstrInfo:: @@ -1164,20 +1176,17 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } } -unsigned -ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { +unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { default: break; case ARM::LDRrs: case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. 
- if (MI->getOperand(1).isFI() && - MI->getOperand(2).isReg() && - MI->getOperand(3).isImm() && - MI->getOperand(2).getReg() == 0 && - MI->getOperand(3).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() && + MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 && + MI.getOperand(3).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; case ARM::LDRi12: @@ -1185,27 +1194,24 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case ARM::tLDRspi: case ARM::VLDRD: case ARM::VLDRS: - if (MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && - MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; case ARM::VLD1q64: case ARM::VLD1d64TPseudo: case ARM::VLD1d64QPseudo: - if (MI->getOperand(1).isFI() && - MI->getOperand(0).getSubReg() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; case ARM::VLDMQIA: - if (MI->getOperand(1).isFI() && - MI->getOperand(0).getSubReg() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1213,20 +1219,19 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { +unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, + int &FrameIndex) const { const MachineMemOperand *Dummy; - return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); + return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); } /// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD /// depending on whether the result is used. 
-void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const { +void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { bool isThumb1 = Subtarget.isThumb1Only(); bool isThumb2 = Subtarget.isThumb2(); const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo(); - MachineInstr *MI = MBBI; DebugLoc dl = MI->getDebugLoc(); MachineBasicBlock *BB = MI->getParent(); @@ -1269,24 +1274,20 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const { STM.addReg(Reg, RegState::Kill); } - BB->erase(MBBI); + BB->erase(MI); } -bool -ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineFunction &MF = *MI->getParent()->getParent(); - Reloc::Model RM = MF.getTarget().getRelocationModel(); - - if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { +bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() && "LOAD_STACK_GUARD currently supported only for MachO."); - expandLoadStackGuard(MI, RM); - MI->getParent()->erase(MI); + expandLoadStackGuard(MI); + MI.getParent()->erase(MI); return true; } - if (MI->getOpcode() == ARM::MEMCPY) { + if (MI.getOpcode() == ARM::MEMCPY) { expandMEMCPY(MI); return true; } @@ -1295,14 +1296,13 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be // changed into a VORR that can go down the NEON pipeline. - if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15() || - Subtarget.isFPOnlySP()) + if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP()) return false; // Look for a copy between even S-registers. That is where we keep floats // when using NEON v2f32 instructions for f32 arithmetic. - unsigned DstRegS = MI->getOperand(0).getReg(); - unsigned SrcRegS = MI->getOperand(1).getReg(); + unsigned DstRegS = MI.getOperand(0).getReg(); + unsigned SrcRegS = MI.getOperand(1).getReg(); if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) return false; @@ -1317,44 +1317,44 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { // We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only // legal if the COPY already defines the full DstRegD, and it isn't a // sub-register insertion. - if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI)) + if (!MI.definesRegister(DstRegD, TRI) || MI.readsRegister(DstRegD, TRI)) return false; // A dead copy shouldn't show up here, but reject it just in case. - if (MI->getOperand(0).isDead()) + if (MI.getOperand(0).isDead()) return false; // All clear, widen the COPY. - DEBUG(dbgs() << "widening: " << *MI); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + DEBUG(dbgs() << "widening: " << MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); // Get rid of the old of DstRegD. Leave it if it defines a Q-reg // or some other super-register. - int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD); + int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD); if (ImpDefIdx != -1) - MI->RemoveOperand(ImpDefIdx); + MI.RemoveOperand(ImpDefIdx); // Change the opcode and operands. 
- MI->setDesc(get(ARM::VMOVD)); - MI->getOperand(0).setReg(DstRegD); - MI->getOperand(1).setReg(SrcRegD); + MI.setDesc(get(ARM::VMOVD)); + MI.getOperand(0).setReg(DstRegD); + MI.getOperand(1).setReg(SrcRegD); AddDefaultPred(MIB); // We are now reading SrcRegD instead of SrcRegS. This may upset the // register scavenger and machine verifier, so we need to indicate that we // are reading an undefined value from SrcRegD, but a proper value from // SrcRegS. - MI->getOperand(1).setIsUndef(); + MI.getOperand(1).setIsUndef(); MIB.addReg(SrcRegS, RegState::Implicit); // SrcRegD may actually contain an unrelated value in the ssub_1 // sub-register. Don't kill it. Only kill the ssub_0 sub-register. - if (MI->getOperand(1).isKill()) { - MI->getOperand(1).setIsKill(false); - MI->addRegisterKilled(SrcRegS, TRI, true); + if (MI.getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(false); + MI.addRegisterKilled(SrcRegS, TRI, true); } - DEBUG(dbgs() << "replaced by: " << *MI); + DEBUG(dbgs() << "replaced by: " << MI); return true; } @@ -1403,54 +1403,54 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { return PCLabelId; } -void ARMBaseInstrInfo:: -reMaterialize(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SubIdx, - const MachineInstr *Orig, - const TargetRegisterInfo &TRI) const { - unsigned Opcode = Orig->getOpcode(); +void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr &Orig, + const TargetRegisterInfo &TRI) const { + unsigned Opcode = Orig.getOpcode(); switch (Opcode) { default: { - MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); + MI->substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); break; } case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { MachineFunction &MF = *MBB.getParent(); - unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned CPI = Orig.getOperand(1).getIndex(); unsigned PCLabelId = duplicateCPV(MF, CPI); - MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), - DestReg) - .addConstantPoolIndex(CPI).addImm(PCLabelId); - MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + MachineInstrBuilder MIB = + BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg) + .addConstantPoolIndex(CPI) + .addImm(PCLabelId); + MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end()); break; } } } -MachineInstr * -ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { +MachineInstr *ARMBaseInstrInfo::duplicate(MachineInstr &Orig, + MachineFunction &MF) const { MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF); - switch(Orig->getOpcode()) { + switch (Orig.getOpcode()) { case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { - unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned CPI = Orig.getOperand(1).getIndex(); unsigned PCLabelId = duplicateCPV(MF, CPI); - Orig->getOperand(1).setIndex(CPI); - Orig->getOperand(2).setImm(PCLabelId); + Orig.getOperand(1).setIndex(CPI); + Orig.getOperand(2).setImm(PCLabelId); break; } } return MI; } -bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, - const MachineInstr *MI1, +bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, + const MachineInstr &MI1, const MachineRegisterInfo *MRI) const { - unsigned Opcode = 
MI0->getOpcode(); + unsigned Opcode = MI0.getOpcode(); if (Opcode == ARM::t2LDRpci || Opcode == ARM::t2LDRpci_pic || Opcode == ARM::tLDRpci || @@ -1461,13 +1461,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, Opcode == ARM::MOV_ga_pcrel || Opcode == ARM::MOV_ga_pcrel_ldr || Opcode == ARM::t2MOV_ga_pcrel) { - if (MI1->getOpcode() != Opcode) + if (MI1.getOpcode() != Opcode) return false; - if (MI0->getNumOperands() != MI1->getNumOperands()) + if (MI0.getNumOperands() != MI1.getNumOperands()) return false; - const MachineOperand &MO0 = MI0->getOperand(1); - const MachineOperand &MO1 = MI1->getOperand(1); + const MachineOperand &MO0 = MI0.getOperand(1); + const MachineOperand &MO1 = MI1.getOperand(1); if (MO0.getOffset() != MO1.getOffset()) return false; @@ -1480,7 +1480,7 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, // Ignore the PC labels. return MO0.getGlobal() == MO1.getGlobal(); - const MachineFunction *MF = MI0->getParent()->getParent(); + const MachineFunction *MF = MI0.getParent()->getParent(); const MachineConstantPool *MCP = MF->getConstantPool(); int CPI0 = MO0.getIndex(); int CPI1 = MO1.getIndex(); @@ -1499,13 +1499,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, } return false; } else if (Opcode == ARM::PICLDR) { - if (MI1->getOpcode() != Opcode) + if (MI1.getOpcode() != Opcode) return false; - if (MI0->getNumOperands() != MI1->getNumOperands()) + if (MI0.getNumOperands() != MI1.getNumOperands()) return false; - unsigned Addr0 = MI0->getOperand(1).getReg(); - unsigned Addr1 = MI1->getOperand(1).getReg(); + unsigned Addr0 = MI0.getOperand(1).getReg(); + unsigned Addr1 = MI1.getOperand(1).getReg(); if (Addr0 != Addr1) { if (!MRI || !TargetRegisterInfo::isVirtualRegister(Addr0) || @@ -1517,21 +1517,21 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, MachineInstr *Def1 = MRI->getVRegDef(Addr1); // Check if the loaded value, e.g. a constantpool of a global address, are // the same. - if (!produceSameValue(Def0, Def1, MRI)) + if (!produceSameValue(*Def0, *Def1, MRI)) return false; } - for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) { + for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) { // %vreg12 = PICLDR %vreg11, 0, pred:14, pred:%noreg - const MachineOperand &MO0 = MI0->getOperand(i); - const MachineOperand &MO1 = MI1->getOperand(i); + const MachineOperand &MO0 = MI0.getOperand(i); + const MachineOperand &MO1 = MI1.getOperand(i); if (!MO0.isIdenticalTo(MO1)) return false; } return true; } - return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); + return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); } /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to @@ -1653,7 +1653,7 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, +bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { // Debug info is never a scheduling boundary. It's necessary to be explicit @@ -1662,11 +1662,11 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, // considered a scheduling hazard, which is wrong. It should be the actual // instruction preceding the dbg_value instruction(s), just like it is // when debug info is not present. - if (MI->isDebugValue()) + if (MI.isDebugValue()) return false; // Terminators and labels can't be scheduled around. 
- if (MI->isTerminator() || MI->isPosition()) + if (MI.isTerminator() || MI.isPosition()) return true; // Treat the start of the IT block as a scheduling boundary, but schedule @@ -1690,7 +1690,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, // Calls don't actually change the stack pointer, even if they have imp-defs. // No ARM calling conventions change the stack pointer. (X86 calling // conventions sometimes do). - if (!MI->isCall() && MI->definesRegister(ARM::SP)) + if (!MI.isCall() && MI.definesRegister(ARM::SP)) return true; return false; @@ -1718,7 +1718,7 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, CmpMI->getOpcode() == ARM::t2CMPri) { unsigned Reg = CmpMI->getOperand(0).getReg(); unsigned PredReg = 0; - ARMCC::CondCodes P = getInstrPredicate(CmpMI, PredReg); + ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg); if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 && isARMLowRegister(Reg)) return false; @@ -1765,24 +1765,24 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, bool ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const { - // Reduce false anti-dependencies to let Swift's out-of-order execution + // Reduce false anti-dependencies to let the target's out-of-order execution // engine do its thing. - return Subtarget.isSwift(); + return Subtarget.isProfitableToUnpredicate(); } /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. -ARMCC::CondCodes -llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { - int PIdx = MI->findFirstPredOperandIdx(); +ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI, + unsigned &PredReg) { + int PIdx = MI.findFirstPredOperandIdx(); if (PIdx == -1) { PredReg = 0; return ARMCC::AL; } - PredReg = MI->getOperand(PIdx+1).getReg(); - return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm(); + PredReg = MI.getOperand(PIdx+1).getReg(); + return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); } @@ -1797,11 +1797,11 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) { llvm_unreachable("Unknown unconditional branch opcode!"); } -MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI, +MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case ARM::MOVCCr: case ARM::t2MOVCCr: { // MOVCC can be commuted by inverting the condition. @@ -1810,13 +1810,14 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI, // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; - MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); - if (!MI) + MachineInstr *CommutedMI = + TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + if (!CommutedMI) return nullptr; // After swapping the MOVCC operands, also invert the condition. 
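Commuting MOVCC is only sound because the operand swap and the condition inversion happen together: select(cc, a, b) and select(!cc, b, a) denote the same value. A quick self-contained check of that identity, with a bool standing in for the ARMCC condition (illustrative, not LLVM code):

    #include <cassert>
    #include <initializer_list>

    // movcc models "DestReg = CC ? TrueVal : FalseVal".
    static int movcc(bool CC, int TrueVal, int FalseVal) {
      return CC ? TrueVal : FalseVal;
    }

    int main() {
      for (bool CC : {false, true})
        for (int A : {1, 7})
          for (int B : {2, 9})
            assert(movcc(CC, A, B) == movcc(!CC, B, A)); // swap + invert
    }

The rewrite above also removes a subtle pitfall of the old code: TargetInstrInfo::commuteInstructionImpl may return a different instruction than it was given (when NewMI is set), so the result is now kept in CommutedMI instead of overwriting MI.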
- MI->getOperand(MI->findFirstPredOperandIdx()) - .setImm(ARMCC::getOppositeCondition(CC)); - return MI; + CommutedMI->getOperand(CommutedMI->findFirstPredOperandIdx()) + .setImm(ARMCC::getOppositeCondition(CC)); + return CommutedMI; } } return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); @@ -1860,11 +1861,11 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg, return MI; } -bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI, +bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr &MI, SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp, unsigned &FalseOp, bool &Optimizable) const { - assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && + assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) && "Unknown select instruction"); // MOVCC operands: // 0: Def. @@ -1874,38 +1875,38 @@ bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI, // 4: CPSR use. TrueOp = 1; FalseOp = 2; - Cond.push_back(MI->getOperand(3)); - Cond.push_back(MI->getOperand(4)); + Cond.push_back(MI.getOperand(3)); + Cond.push_back(MI.getOperand(4)); // We can always fold a def. Optimizable = true; return false; } MachineInstr * -ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, +ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &SeenMIs, bool PreferFalse) const { - assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && + assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) && "Unknown select instruction"); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineInstr *DefMI = canFoldIntoMOVCC(MI.getOperand(2).getReg(), MRI, this); bool Invert = !DefMI; if (!DefMI) - DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this); + DefMI = canFoldIntoMOVCC(MI.getOperand(1).getReg(), MRI, this); if (!DefMI) return nullptr; // Find new register class to use. - MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); - unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1); + unsigned DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; // Create a new predicated version of DefMI. // Rfalse is the first use. - MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - DefMI->getDesc(), DestReg); + MachineInstrBuilder NewMI = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg); // Copy all the DefMI operands, excluding its (null) predicate. const MCInstrDesc &DefDesc = DefMI->getDesc(); @@ -1913,12 +1914,12 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, i != e && !DefDesc.OpInfo[i].isPredicate(); ++i) NewMI.addOperand(DefMI->getOperand(i)); - unsigned CondCode = MI->getOperand(3).getImm(); + unsigned CondCode = MI.getOperand(3).getImm(); if (Invert) NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode))); else NewMI.addImm(CondCode); - NewMI.addOperand(MI->getOperand(4)); + NewMI.addOperand(MI.getOperand(4)); // DefMI is not the -S version that sets CPSR, so add an optional %noreg. if (NewMI->hasOptionalDef()) @@ -1940,7 +1941,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, // DefMI would be invalid when transferred inside the loop.
Checking for a // loop is expensive, but at least remove kill flags if they are in different // BBs. - if (DefMI->getParent() != MI->getParent()) + if (DefMI->getParent() != MI.getParent()) NewMI->clearKillInfo(); // The caller will erase MI, but not DefMI. @@ -1994,10 +1995,12 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) { } void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, - unsigned DestReg, unsigned BaseReg, int NumBytes, - ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII, unsigned MIFlags) { + MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, + unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, + unsigned MIFlags) { if (NumBytes == 0 && DestReg != BaseReg) { BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg) .addReg(BaseReg, RegState::Kill) @@ -2281,30 +2284,30 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction /// can be analyzed. -bool ARMBaseInstrInfo:: -analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, - int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { +bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI.getOpcode()) { default: break; case ARM::CMPri: case ARM::t2CMPri: - SrcReg = MI->getOperand(0).getReg(); + SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; CmpMask = ~0; - CmpValue = MI->getOperand(1).getImm(); + CmpValue = MI.getOperand(1).getImm(); return true; case ARM::CMPrr: case ARM::t2CMPrr: - SrcReg = MI->getOperand(0).getReg(); - SrcReg2 = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = MI.getOperand(1).getReg(); CmpMask = ~0; CmpValue = 0; return true; case ARM::TSTri: case ARM::t2TSTri: - SrcReg = MI->getOperand(0).getReg(); + SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; - CmpMask = MI->getOperand(1).getImm(); + CmpMask = MI.getOperand(1).getImm(); CmpValue = 0; return true; } @@ -2385,25 +2388,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg, /// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two /// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the /// condition code of instructions which use the flags. -bool ARMBaseInstrInfo:: -optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, - int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const { +bool ARMBaseInstrInfo::optimizeCompareInstr( + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; // Masked compares sometimes use the same register as the corresponding 'and'. 
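As the doc comment for optimizeCompareInstr says, the flags of SUBrr(r1, r2) can also stand in for CMPrr(r2, r1); since the operands are swapped, every consumer of the flags must switch to the condition that reads the swapped comparison. A sketch of that remapping for the signed conditions (the unsigned ARMCC pairs HI/LO and HS/LS swap the same way; this helper is illustrative, not the ARMCC API):

    // Signed condition codes only, for illustration.
    enum Cond { EQ, NE, GT, GE, LT, LE };

    static Cond swappedOperandsCond(Cond CC) {
      switch (CC) {
      case EQ: return EQ; // equality is symmetric
      case NE: return NE;
      case GT: return LT; // a >  b  <=>  b <  a
      case GE: return LE; // a >= b  <=>  b <= a
      case LT: return GT;
      case LE: return GE;
      }
      return CC;
    }

Equality conditions survive unchanged because a == b and b == a agree; the ordered conditions must flip. That "and" sharing mentioned just above is what the CmpMask path below untangles.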
if (CmpMask != ~0) { - if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) { + if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(*MI)) { MI = nullptr; for (MachineRegisterInfo::use_instr_iterator UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end(); UI != UE; ++UI) { - if (UI->getParent() != CmpInstr->getParent()) continue; + if (UI->getParent() != CmpInstr.getParent()) + continue; MachineInstr *PotentialAND = &*UI; if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) || - isPredicated(PotentialAND)) + isPredicated(*PotentialAND)) continue; MI = PotentialAND; break; @@ -2414,7 +2417,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // Get ready to iterate backward from CmpInstr. MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); + B = CmpInstr.getParent()->begin(); // Early exit if CmpInstr is at the beginning of the BB. if (I == B) return false; @@ -2427,13 +2430,13 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, if (SrcReg2 != 0) // MI is not a candidate for CMPrr. MI = nullptr; - else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) { + else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) { // Conservatively refuse to convert an instruction which isn't in the same // BB as the comparison. // For CMPri w/ CmpValue != 0, a Sub may still be a candidate. // Thus we cannot return here. - if (CmpInstr->getOpcode() == ARM::CMPri || - CmpInstr->getOpcode() == ARM::t2CMPri) + if (CmpInstr.getOpcode() == ARM::CMPri || + CmpInstr.getOpcode() == ARM::t2CMPri) MI = nullptr; else return false; @@ -2453,7 +2456,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, return false; // Check whether CmpInstr can be made redundant by the current instruction. - if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) { + if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) { Sub = &*I; break; } @@ -2471,7 +2474,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, if (!MI) MI = Sub; // We can't use a predicated instruction - it doesn't always write the flags. - if (isPredicated(MI)) + if (isPredicated(*MI)) return false; switch (MI->getOpcode()) { @@ -2519,7 +2522,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, SmallVector<std::pair<MachineOperand *, ARMCC::CondCodes>, 4> OperandsToUpdate; bool isSafe = false; I = CmpInstr; - E = CmpInstr->getParent()->end(); + E = CmpInstr.getParent()->end(); while (!isSafe && ++I != E) { const MachineInstr &Instr = *I; for (unsigned IO = 0, EO = Instr.getNumOperands(); @@ -2608,7 +2611,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If CPSR is not killed nor re-defined, we should check whether it is // live-out. If it is live-out, do not optimize. if (!isSafe) { - MachineBasicBlock *MBB = CmpInstr->getParent(); + MachineBasicBlock *MBB = CmpInstr.getParent(); for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) if ((*SI)->isLiveIn(ARM::CPSR)) @@ -2618,8 +2621,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // Toggle the optional operand to CPSR.
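Past the CPSR toggle just below, the FoldImmediate changes rely on ARM's "two part" immediate test: a 32-bit constant that is not itself a modified immediate (an 8-bit value rotated right by an even amount) may still split into two such pieces, so a MOVi32imm feeding ADDrr/SUBrr can become two immediate-form instructions. A brute-force stand-in for ARM_AM::isSOImmTwoPartVal, written from the encoding rule rather than copied from LLVM:

    #include <cstdint>

    static uint32_t rotl32(uint32_t V, unsigned R) {
      return R == 0 ? V : ((V << R) | (V >> (32 - R)));
    }

    // V is a modified immediate iff some even left-rotation fits in 8 bits
    // (rotating left undoes the encoding's rotate-right).
    static bool isModImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2)
        if (rotl32(V, Rot) <= 0xFFu)
          return true;
      return false;
    }

    // V is "two part" if it is not encodable directly but splits into two
    // encodable pieces covering disjoint bits.
    static bool isTwoPartImm(uint32_t V) {
      if (isModImm(V))
        return false; // a single instruction already suffices
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        uint32_t Part = V & rotl32(0xFFu, Rot); // bits in one 8-bit window
        if (Part && isModImm(V & ~Part))
          return true;
      }
      return false;
    }

With a split in hand, the patched FoldImmediate materializes the first piece (ARM_AM::getSOImmTwoPartFirst) into a fresh register and folds the second into the rewritten use, and it now also accepts negated constants by flipping between the ADD and SUB forms.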
MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); - assert(!isPredicated(MI) && "Can't use flags from predicated instruction"); - CmpInstr->eraseFromParent(); + assert(!isPredicated(*MI) && "Can't use flags from predicated instruction"); + CmpInstr.eraseFromParent(); // Modify the condition code of operands in OperandsToUpdate. // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to @@ -2633,42 +2636,42 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, return false; } -bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, - MachineInstr *DefMI, unsigned Reg, +bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { // Fold large immediates into add, sub, or, xor. - unsigned DefOpc = DefMI->getOpcode(); + unsigned DefOpc = DefMI.getOpcode(); if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm) return false; - if (!DefMI->getOperand(1).isImm()) + if (!DefMI.getOperand(1).isImm()) // Could be t2MOVi32imm return false; if (!MRI->hasOneNonDBGUse(Reg)) return false; - const MCInstrDesc &DefMCID = DefMI->getDesc(); + const MCInstrDesc &DefMCID = DefMI.getDesc(); if (DefMCID.hasOptionalDef()) { unsigned NumOps = DefMCID.getNumOperands(); - const MachineOperand &MO = DefMI->getOperand(NumOps-1); + const MachineOperand &MO = DefMI.getOperand(NumOps - 1); if (MO.getReg() == ARM::CPSR && !MO.isDead()) // If DefMI defines CPSR and it is not dead, it's obviously not safe // to delete DefMI. return false; } - const MCInstrDesc &UseMCID = UseMI->getDesc(); + const MCInstrDesc &UseMCID = UseMI.getDesc(); if (UseMCID.hasOptionalDef()) { unsigned NumOps = UseMCID.getNumOperands(); - if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR) + if (UseMI.getOperand(NumOps - 1).getReg() == ARM::CPSR) // If the instruction sets the flag, do not attempt this optimization // since it may change the semantics of the code. return false; } - unsigned UseOpc = UseMI->getOpcode(); + unsigned UseOpc = UseMI.getOpcode(); unsigned NewUseOpc = 0; - uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm(); + uint32_t ImmVal = (uint32_t)DefMI.getOperand(1).getImm(); uint32_t SOImmValV1 = 0, SOImmValV2 = 0; bool Commute = false; switch (UseOpc) { @@ -2681,17 +2684,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, case ARM::t2ADDrr: case ARM::t2ORRrr: case ARM::t2EORrr: { - Commute = UseMI->getOperand(2).getReg() != Reg; + Commute = UseMI.getOperand(2).getReg() != Reg; switch (UseOpc) { default: break; + case ARM::ADDrr: case ARM::SUBrr: { - if (Commute) + if (UseOpc == ARM::SUBrr && Commute) + return false; + + // ADD/SUB are special because they're essentially the same operation, so + // we can handle a larger range of immediates. + if (ARM_AM::isSOImmTwoPartVal(ImmVal)) + NewUseOpc = UseOpc == ARM::ADDrr ? ARM::ADDri : ARM::SUBri; + else if (ARM_AM::isSOImmTwoPartVal(-ImmVal)) { + ImmVal = -ImmVal; + NewUseOpc = UseOpc == ARM::ADDrr ? 
ARM::SUBri : ARM::ADDri; + } else return false; - ImmVal = -ImmVal; - NewUseOpc = ARM::SUBri; - // Fallthrough + SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal); + break; } - case ARM::ADDrr: case ARM::ORRrr: case ARM::EORrr: { if (!ARM_AM::isSOImmTwoPartVal(ImmVal)) @@ -2700,20 +2713,29 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal); switch (UseOpc) { default: break; - case ARM::ADDrr: NewUseOpc = ARM::ADDri; break; case ARM::ORRrr: NewUseOpc = ARM::ORRri; break; case ARM::EORrr: NewUseOpc = ARM::EORri; break; } break; } + case ARM::t2ADDrr: case ARM::t2SUBrr: { - if (Commute) + if (UseOpc == ARM::t2SUBrr && Commute) return false; - ImmVal = -ImmVal; - NewUseOpc = ARM::t2SUBri; - // Fallthrough + + // ADD/SUB are special because they're essentially the same operation, so + // we can handle a larger range of immediates. + if (ARM_AM::isT2SOImmTwoPartVal(ImmVal)) + NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri; + else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) { + ImmVal = -ImmVal; + NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri; + } else + return false; + SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); + break; } - case ARM::t2ADDrr: case ARM::t2ORRrr: case ARM::t2EORrr: { if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal)) @@ -2722,7 +2744,6 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); switch (UseOpc) { default: break; - case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break; case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break; case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break; } @@ -2733,27 +2754,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, } unsigned OpIdx = Commute ? 
2 : 1; - unsigned Reg1 = UseMI->getOperand(OpIdx).getReg(); - bool isKill = UseMI->getOperand(OpIdx).isKill(); + unsigned Reg1 = UseMI.getOperand(OpIdx).getReg(); + bool isKill = UseMI.getOperand(OpIdx).isKill(); unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); - AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(), - UseMI, UseMI->getDebugLoc(), - get(NewUseOpc), NewReg) - .addReg(Reg1, getKillRegState(isKill)) - .addImm(SOImmValV1))); - UseMI->setDesc(get(NewUseOpc)); - UseMI->getOperand(1).setReg(NewReg); - UseMI->getOperand(1).setIsKill(); - UseMI->getOperand(2).ChangeToImmediate(SOImmValV2); - DefMI->eraseFromParent(); + AddDefaultCC( + AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(NewUseOpc), NewReg) + .addReg(Reg1, getKillRegState(isKill)) + .addImm(SOImmValV1))); + UseMI.setDesc(get(NewUseOpc)); + UseMI.getOperand(1).setReg(NewReg); + UseMI.getOperand(1).setIsKill(); + UseMI.getOperand(2).ChangeToImmediate(SOImmValV2); + DefMI.eraseFromParent(); return true; } static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, - const MachineInstr *MI) { - switch (MI->getOpcode()) { + const MachineInstr &MI) { + switch (MI.getOpcode()) { default: { - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); int UOps = ItinData->getNumMicroOps(Desc.getSchedClass()); assert(UOps >= 0 && "bad # UOps"); return UOps; @@ -2763,7 +2784,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRBrs: case ARM::STRrs: case ARM::STRBrs: { - unsigned ShOpVal = MI->getOperand(3).getImm(); + unsigned ShOpVal = MI.getOperand(3).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -2776,10 +2797,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRH: case ARM::STRH: { - if (!MI->getOperand(2).getReg()) + if (!MI.getOperand(2).getReg()) return 1; - unsigned ShOpVal = MI->getOperand(3).getImm(); + unsigned ShOpVal = MI.getOperand(3).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -2792,22 +2813,22 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB: case ARM::LDRSH: - return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2; + return (ARM_AM::getAM3Op(MI.getOperand(3).getImm()) == ARM_AM::sub) ? 3 : 2; case ARM::LDRSB_POST: case ARM::LDRSH_POST: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 
4 : 3; } case ARM::LDR_PRE_REG: case ARM::LDRB_PRE_REG: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); if (Rt == Rm) return 3; - unsigned ShOpVal = MI->getOperand(4).getImm(); + unsigned ShOpVal = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -2820,7 +2841,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::STR_PRE_REG: case ARM::STRB_PRE_REG: { - unsigned ShOpVal = MI->getOperand(4).getImm(); + unsigned ShOpVal = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -2833,21 +2854,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRH_PRE: case ARM::STRH_PRE: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); if (!Rm) return 2; if (Rt == Rm) return 3; - return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) - ? 3 : 2; + return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 3 : 2; } case ARM::LDR_POST_REG: case ARM::LDRB_POST_REG: case ARM::LDRH_POST: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 3 : 2; } @@ -2866,13 +2886,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_PRE: case ARM::LDRSH_PRE: { - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); if (Rm == 0) return 3; - unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); if (Rt == Rm) return 4; - unsigned ShOpVal = MI->getOperand(4).getImm(); + unsigned ShOpVal = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -2884,18 +2904,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::LDRD: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rn = MI->getOperand(2).getReg(); - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rn = MI.getOperand(2).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); if (Rm) - return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3; + return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 + : 3; return (Rt == Rn) ? 3 : 2; } case ARM::STRD: { - unsigned Rm = MI->getOperand(3).getReg(); + unsigned Rm = MI.getOperand(3).getReg(); if (Rm) - return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3; + return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 
4 + : 3; return 2; } @@ -2908,24 +2930,26 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 4; case ARM::LDRD_PRE: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rn = MI->getOperand(3).getReg(); - unsigned Rm = MI->getOperand(4).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rn = MI.getOperand(3).getReg(); + unsigned Rm = MI.getOperand(4).getReg(); if (Rm) - return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4; + return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 + : 4; return (Rt == Rn) ? 4 : 3; } case ARM::t2LDRD_PRE: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rn = MI->getOperand(3).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rn = MI.getOperand(3).getReg(); return (Rt == Rn) ? 4 : 3; } case ARM::STRD_PRE: { - unsigned Rm = MI->getOperand(4).getReg(); + unsigned Rm = MI.getOperand(4).getReg(); if (Rm) - return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4; + return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 + : 4; return 3; } @@ -2953,8 +2977,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 2; case ARM::t2LDRDi8: { - unsigned Rt = MI->getOperand(0).getReg(); - unsigned Rn = MI->getOperand(2).getReg(); + unsigned Rt = MI.getOperand(0).getReg(); + unsigned Rn = MI.getOperand(2).getReg(); return (Rt == Rn) ? 3 : 2; } @@ -2994,22 +3018,61 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, // sizes during MC lowering. That target hook should be local to MC lowering // because we can't ensure that it is aware of other MI forms. Doing this will // ensure that MachineMemOperands are correctly propagated through all passes. -unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const { +unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const { unsigned Size = 0; - for (MachineInstr::mmo_iterator I = MI->memoperands_begin(), - E = MI->memoperands_end(); I != E; ++I) { + for (MachineInstr::mmo_iterator I = MI.memoperands_begin(), + E = MI.memoperands_end(); + I != E; ++I) { Size += (*I)->getSize(); } return Size / 4; } -unsigned -ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, - const MachineInstr *MI) const { +static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc, + unsigned NumRegs) { + unsigned UOps = 1 + NumRegs; // 1 for address computation. + switch (Opc) { + default: + break; + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: + ++UOps; // One for base register writeback. + break; + case ARM::LDMIA_RET: + case ARM::tPOP_RET: + case ARM::t2LDMIA_RET: + UOps += 2; // One for base reg wb, one for write to pc. 
+ break; + } + return UOps; +} + +unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr &MI) const { if (!ItinData || ItinData->isEmpty()) return 1; - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); unsigned Class = Desc.getSchedClass(); int ItinUOps = ItinData->getNumMicroOps(Class); if (ItinUOps >= 0) { @@ -3019,7 +3082,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, return ItinUOps; } - unsigned Opc = MI->getOpcode(); + unsigned Opc = MI.getOpcode(); switch (Opc) { default: llvm_unreachable("Unexpected multi-uops instruction!"); @@ -3049,7 +3112,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, case ARM::VSTMSIA: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: { - unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands(); + unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands(); return (NumRegs / 2) + (NumRegs % 2) + 1; } @@ -3085,66 +3148,36 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, case ARM::t2STMDB: case ARM::t2STMIA_UPD: case ARM::t2STMDB_UPD: { - unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1; - if (Subtarget.isSwift()) { - int UOps = 1 + NumRegs; // One for address computation, one for each ld / st. - switch (Opc) { - default: break; - case ARM::VLDMDIA_UPD: - case ARM::VLDMDDB_UPD: - case ARM::VLDMSIA_UPD: - case ARM::VLDMSDB_UPD: - case ARM::VSTMDIA_UPD: - case ARM::VSTMDDB_UPD: - case ARM::VSTMSIA_UPD: - case ARM::VSTMSDB_UPD: - case ARM::LDMIA_UPD: - case ARM::LDMDA_UPD: - case ARM::LDMDB_UPD: - case ARM::LDMIB_UPD: - case ARM::STMIA_UPD: - case ARM::STMDA_UPD: - case ARM::STMDB_UPD: - case ARM::STMIB_UPD: - case ARM::tLDMIA_UPD: - case ARM::tSTMIA_UPD: - case ARM::t2LDMIA_UPD: - case ARM::t2LDMDB_UPD: - case ARM::t2STMIA_UPD: - case ARM::t2STMDB_UPD: - ++UOps; // One for base register writeback. - break; - case ARM::LDMIA_RET: - case ARM::tPOP_RET: - case ARM::t2LDMIA_RET: - UOps += 2; // One for base reg wb, one for write to pc. - break; - } - return UOps; - } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) { + unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands() + 1; + switch (Subtarget.getLdStMultipleTiming()) { + case ARMSubtarget::SingleIssuePlusExtras: + return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs); + case ARMSubtarget::SingleIssue: + // Assume the worst. + return NumRegs; + case ARMSubtarget::DoubleIssue: { if (NumRegs < 4) return 2; // 4 registers would be issued: 2, 2. // 5 registers would be issued: 2, 2, 1. - int A8UOps = (NumRegs / 2); + unsigned UOps = (NumRegs / 2); if (NumRegs % 2) - ++A8UOps; - return A8UOps; - } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) { - int A9UOps = (NumRegs / 2); + ++UOps; + return UOps; + } + case ARMSubtarget::DoubleIssueCheckUnalignedAccess: { + unsigned UOps = (NumRegs / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. - if ((NumRegs % 2) || - !MI->hasOneMemOperand() || - (*MI->memoperands_begin())->getAlignment() < 8) - ++A9UOps; - return A9UOps; - } else { - // Assume the worst. 
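This hunk replaces CPU-identity checks (isSwift, isCortexA8/isCortexA7, isLikeA9) with a single subtarget property, ARMSubtarget::getLdStMultipleTiming(), and moves the Swift-only opcode switch into the getNumMicroOpsSingleIssuePlusExtras helper above. A toy model of the new dispatch; the CPU-to-timing assignments in the comments are inferred from the deleted branches, since the real mapping lives in ARMSubtarget, outside this patch:

    enum class LdStMultipleTiming {
      SingleIssuePlusExtras,          // old isSwift() path
      SingleIssue,                    // old "assume the worst" fallback
      DoubleIssue,                    // old isCortexA8()/isCortexA7() path
      DoubleIssueCheckUnalignedAccess // old isLikeA9() path
    };

    static unsigned uopsForLdStMultiple(LdStMultipleTiming T, unsigned NumRegs,
                                        bool Aligned64, bool OneMemOperand) {
      switch (T) {
      case LdStMultipleTiming::SingleIssuePlusExtras:
        return 1 + NumRegs; // plus writeback/PC extras, as in the helper above
      case LdStMultipleTiming::SingleIssue:
        return NumRegs; // one micro-op per register, worst case
      case LdStMultipleTiming::DoubleIssue:
        // Two registers per issue slot (the real code also floors short
        // register lists at 2 micro-ops).
        return NumRegs / 2 + NumRegs % 2;
      case LdStMultipleTiming::DoubleIssueCheckUnalignedAccess: {
        unsigned UOps = NumRegs / 2;
        if ((NumRegs % 2) || !OneMemOperand || !Aligned64)
          ++UOps; // extra AGU cycle, mirroring the patched code
        return UOps;
      }
      }
      return NumRegs;
    }

Keeping the per-opcode extras (writeback, write to PC) in a standalone helper means a future subtarget only has to pick an enum value instead of growing another else-if chain. The deleted fallback ("assume the worst") returned NumRegs, which is exactly what SingleIssue now encodes.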
- return NumRegs; + if ((NumRegs % 2) || !MI.hasOneMemOperand() || + (*MI.memoperands_begin())->getAlignment() < 8) + ++UOps; + return UOps; + } } } } + llvm_unreachable("Didn't find the number of microops"); } int @@ -3428,13 +3461,13 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, } static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, - const MachineInstr *MI, unsigned Reg, + const MachineInstr &MI, unsigned Reg, unsigned &UseIdx, unsigned &Dist) { Dist = 0; - MachineBasicBlock::const_instr_iterator II = ++MI->getIterator(); + MachineBasicBlock::const_instr_iterator II = ++MI.getIterator(); assert(II->isInsideBundle() && "Empty bundle?"); - MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); // FIXME: This doesn't properly handle multiple uses. int Idx = -1; @@ -3460,17 +3493,17 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, /// itinerary based on the def opcode and alignment. The caller will ensure that /// adjusted latency is at least one cycle. static int adjustDefLatency(const ARMSubtarget &Subtarget, - const MachineInstr *DefMI, - const MCInstrDesc *DefMCID, unsigned DefAlign) { + const MachineInstr &DefMI, + const MCInstrDesc &DefMCID, unsigned DefAlign) { int Adjust = 0; if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. - switch (DefMCID->getOpcode()) { + switch (DefMCID.getOpcode()) { default: break; case ARM::LDRrs: case ARM::LDRBrs: { - unsigned ShOpVal = DefMI->getOperand(3).getImm(); + unsigned ShOpVal = DefMI.getOperand(3).getImm(); unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (ShImm == 0 || (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) @@ -3482,7 +3515,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, case ARM::t2LDRHs: case ARM::t2LDRSHs: { // Thumb2 mode: lsl only. - unsigned ShAmt = DefMI->getOperand(3).getImm(); + unsigned ShAmt = DefMI.getOperand(3).getImm(); if (ShAmt == 0 || ShAmt == 2) --Adjust; break; @@ -3491,11 +3524,11 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, } else if (Subtarget.isSwift()) { // FIXME: Properly handle all of the latency adjustments for address // writeback. - switch (DefMCID->getOpcode()) { + switch (DefMCID.getOpcode()) { default: break; case ARM::LDRrs: case ARM::LDRBrs: { - unsigned ShOpVal = DefMI->getOperand(3).getImm(); + unsigned ShOpVal = DefMI.getOperand(3).getImm(); bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (!isSub && @@ -3513,7 +3546,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, case ARM::t2LDRHs: case ARM::t2LDRSHs: { // Thumb2 mode: lsl only. 
- unsigned ShAmt = DefMI->getOperand(3).getImm(); + unsigned ShAmt = DefMI.getOperand(3).getImm(); if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3) Adjust -= 2; break; @@ -3521,8 +3554,8 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, } } - if (DefAlign < 8 && Subtarget.isLikeA9()) { - switch (DefMCID->getOpcode()) { + if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) { + switch (DefMCID.getOpcode()) { default: break; case ARM::VLD1q8: case ARM::VLD1q16: @@ -3637,53 +3670,55 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, return Adjust; } - - -int -ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, - unsigned UseIdx) const { +int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr &DefMI, + unsigned DefIdx, + const MachineInstr &UseMI, + unsigned UseIdx) const { // No operand latency. The caller may fall back to getInstrLatency. if (!ItinData || ItinData->isEmpty()) return -1; - const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + const MachineOperand &DefMO = DefMI.getOperand(DefIdx); unsigned Reg = DefMO.getReg(); - const MCInstrDesc *DefMCID = &DefMI->getDesc(); - const MCInstrDesc *UseMCID = &UseMI->getDesc(); + const MachineInstr *ResolvedDefMI = &DefMI; unsigned DefAdj = 0; - if (DefMI->isBundle()) { - DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); - DefMCID = &DefMI->getDesc(); - } - if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) { + if (DefMI.isBundle()) + ResolvedDefMI = + getBundledDefMI(&getRegisterInfo(), &DefMI, Reg, DefIdx, DefAdj); + if (ResolvedDefMI->isCopyLike() || ResolvedDefMI->isInsertSubreg() || + ResolvedDefMI->isRegSequence() || ResolvedDefMI->isImplicitDef()) { return 1; } + const MachineInstr *ResolvedUseMI = &UseMI; unsigned UseAdj = 0; - if (UseMI->isBundle()) { - unsigned NewUseIdx; - const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, - Reg, NewUseIdx, UseAdj); - if (!NewUseMI) + if (UseMI.isBundle()) { + ResolvedUseMI = + getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj); + if (!ResolvedUseMI) return -1; - - UseMI = NewUseMI; - UseIdx = NewUseIdx; - UseMCID = &UseMI->getDesc(); } + return getOperandLatencyImpl( + ItinData, *ResolvedDefMI, DefIdx, ResolvedDefMI->getDesc(), DefAdj, DefMO, + Reg, *ResolvedUseMI, UseIdx, ResolvedUseMI->getDesc(), UseAdj); +} + +int ARMBaseInstrInfo::getOperandLatencyImpl( + const InstrItineraryData *ItinData, const MachineInstr &DefMI, + unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj, + const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI, + unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const { if (Reg == ARM::CPSR) { - if (DefMI->getOpcode() == ARM::FMSTAT) { + if (DefMI.getOpcode() == ARM::FMSTAT) { // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) return Subtarget.isLikeA9() ? 1 : 20; } // CPSR set and branch can be paired in the same cycle. - if (UseMI->isBranch()) + if (UseMI.isBranch()) return 0; // Otherwise it takes the instruction latency (generally one). @@ -3694,7 +3729,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // incur a code size penalty (not able to use the CPSR setting 16-bit // instructions). 
if (Latency > 0 && Subtarget.isThumb2()) { - const MachineFunction *MF = DefMI->getParent()->getParent(); + const MachineFunction *MF = DefMI.getParent()->getParent(); // FIXME: Use Function::optForSize(). if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) --Latency; @@ -3702,17 +3737,19 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } - if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit()) + if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit()) return -1; - unsigned DefAlign = DefMI->hasOneMemOperand() - ? (*DefMI->memoperands_begin())->getAlignment() : 0; - unsigned UseAlign = UseMI->hasOneMemOperand() - ? (*UseMI->memoperands_begin())->getAlignment() : 0; + unsigned DefAlign = DefMI.hasOneMemOperand() + ? (*DefMI.memoperands_begin())->getAlignment() + : 0; + unsigned UseAlign = UseMI.hasOneMemOperand() + ? (*UseMI.memoperands_begin())->getAlignment() + : 0; // Get the itinerary's latency if possible, and handle variable_ops. - int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, - *UseMCID, UseIdx, UseAlign); + int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID, + UseIdx, UseAlign); // Unable to find operand latency. The caller may resort to getInstrLatency. if (Latency < 0) return Latency; @@ -3746,10 +3783,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, if (!UseNode->isMachineOpcode()) { int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx); - if (Subtarget.isLikeA9() || Subtarget.isSwift()) - return Latency <= 2 ? 1 : Latency - 1; - else - return Latency <= 3 ? 1 : Latency - 2; + int Adj = Subtarget.getPreISelOperandLatencyAdjustment(); + int Threshold = 1 + Adj; + return Latency <= Threshold ? 1 : Latency - Adj; } const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); @@ -3820,7 +3856,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } - if (DefAlign < 8 && Subtarget.isLikeA9()) + if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) switch (DefMCID.getOpcode()) { default: break; case ARM::VLD1q8: @@ -3946,15 +3982,15 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } -unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const { - if (MI->isCopyLike() || MI->isInsertSubreg() || - MI->isRegSequence() || MI->isImplicitDef()) +unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const { + if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() || + MI.isImplicitDef()) return 0; - if (MI->isBundle()) + if (MI.isBundle()) return 0; - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) { // When predicated, CPSR is an additional source operand for CPSR updating @@ -3965,26 +4001,26 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const { } unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, + const MachineInstr &MI, unsigned *PredCost) const { - if (MI->isCopyLike() || MI->isInsertSubreg() || - MI->isRegSequence() || MI->isImplicitDef()) + if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() || + MI.isImplicitDef()) return 1; // An instruction scheduler typically runs on unbundled instructions, however // other passes may query the latency of a bundled instruction. 
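The getPreISelOperandLatencyAdjustment change earlier in this hunk collapses two hard-coded branches into one formula: with adjustment Adj, the result is Latency <= 1 + Adj ? 1 : Latency - Adj. Assuming the subtarget reports Adj = 1 for the old isLikeA9/isSwift case and Adj = 2 otherwise (the hook itself is not shown in this diff), the formula reproduces both originals exactly; a tiny self-contained check:

    #include <cassert>

    static int unified(int Latency, int Adj) {
      int Threshold = 1 + Adj;
      return Latency <= Threshold ? 1 : Latency - Adj;
    }

    int main() {
      for (int L = 0; L <= 16; ++L) {
        assert(unified(L, 1) == (L <= 2 ? 1 : L - 1)); // old A9-like/Swift arm
        assert(unified(L, 2) == (L <= 3 ? 1 : L - 2)); // old default arm
      }
    }

So behavior is unchanged for the CPUs the old branches named, and new subtargets simply report their own adjustment. Bundle latency, picked up below, is the sum over the bundled instructions, with t2IT excluded.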
- if (MI->isBundle()) { + if (MI.isBundle()) { unsigned Latency = 0; - MachineBasicBlock::const_instr_iterator I = MI->getIterator(); - MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator I = MI.getIterator(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { if (I->getOpcode() != ARM::t2IT) - Latency += getInstrLatency(ItinData, &*I, PredCost); + Latency += getInstrLatency(ItinData, *I, PredCost); } return Latency; } - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. @@ -3993,7 +4029,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, // Be sure to call getStageLatency for an empty itinerary in case it has a // valid MinLatency property. if (!ItinData) - return MI->mayLoad() ? 3 : 1; + return MI.mayLoad() ? 3 : 1; unsigned Class = MCID.getSchedClass(); @@ -4005,9 +4041,9 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, unsigned Latency = ItinData->getStageLatency(Class); // Adjust for dynamic def-side opcode variants not captured by the itinerary. - unsigned DefAlign = MI->hasOneMemOperand() - ? (*MI->memoperands_begin())->getAlignment() : 0; - int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign); + unsigned DefAlign = + MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0; + int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign); if (Adj >= 0 || (int)Latency > -Adj) { return Latency + Adj; } @@ -4032,46 +4068,46 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } } -bool ARMBaseInstrInfo:: -hasHighOperandLatency(const TargetSchedModel &SchedModel, - const MachineRegisterInfo *MRI, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; - unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask; - if (Subtarget.isCortexA8() && +bool ARMBaseInstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr &DefMI, + unsigned DefIdx, + const MachineInstr &UseMI, + unsigned UseIdx) const { + unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask; + unsigned UDomain = UseMI.getDesc().TSFlags & ARMII::DomainMask; + if (Subtarget.nonpipelinedVFP() && (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP)) - // CortexA8 VFP instructions are not pipelined. return true; // Hoist VFP / NEON instructions with 4 or higher latency. 
- unsigned Latency - = SchedModel.computeOperandLatency(DefMI, DefIdx, UseMI, UseIdx); + unsigned Latency = + SchedModel.computeOperandLatency(&DefMI, DefIdx, &UseMI, UseIdx); if (Latency <= 3) return false; return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON; } -bool ARMBaseInstrInfo:: -hasLowDefLatency(const TargetSchedModel &SchedModel, - const MachineInstr *DefMI, unsigned DefIdx) const { +bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel, + const MachineInstr &DefMI, + unsigned DefIdx) const { const InstrItineraryData *ItinData = SchedModel.getInstrItineraries(); if (!ItinData || ItinData->isEmpty()) return false; - unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask; if (DDomain == ARMII::DomainGeneral) { - unsigned DefClass = DefMI->getDesc().getSchedClass(); + unsigned DefClass = DefMI.getDesc().getSchedClass(); int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); return (DefCycle != -1 && DefCycle <= 2); } return false; } -bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI, +bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { - if (convertAddSubFlagsOpcode(MI->getOpcode())) { + if (convertAddSubFlagsOpcode(MI.getOpcode())) { ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG"; return false; } @@ -4082,8 +4118,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI, // sequence is needed for other targets. void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, - unsigned LoadOpc, - Reloc::Model RM) const { + unsigned LoadOpc) const { MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned Reg = MI->getOperand(0).getReg(); @@ -4094,12 +4129,12 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg) .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); - if (Subtarget.GVIsIndirectSymbol(GV, RM)) { + if (Subtarget.isGVIndirectSymbol(GV)) { MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); - unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4); MIB.addMemOperand(MMO); AddDefaultPred(MIB); } @@ -4146,24 +4181,24 @@ enum ARMExeDomain { // Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h // std::pair<uint16_t, uint16_t> -ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { +ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const { // If we don't have access to NEON instructions then we won't be able // to swizzle anything to the NEON domain. Check to make sure. if (Subtarget.hasNEON()) { // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON // if they are not predicated. - if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) + if (MI.getOpcode() == ARM::VMOVD && !isPredicated(MI)) return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); // CortexA9 is particularly picky about mixing the two and wants these // converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) && - (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || - MI->getOpcode() == ARM::VMOVS)) + if (Subtarget.useNEONForFPMovs() && !isPredicated(MI) && + (MI.getOpcode() == ARM::VMOVRS || MI.getOpcode() == ARM::VMOVSR || + MI.getOpcode() == ARM::VMOVS)) return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); } // No other instructions can be swizzled, so just determine their domain. - unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask; + unsigned Domain = MI.getDesc().TSFlags & ARMII::DomainMask; if (Domain & ARMII::DomainNEON) return std::make_pair(ExeNEON, 0); @@ -4210,12 +4245,11 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, /// (including the case where the DPR itself is defined), it should not. /// static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, - MachineInstr *MI, - unsigned DReg, unsigned Lane, - unsigned &ImplicitSReg) { + MachineInstr &MI, unsigned DReg, + unsigned Lane, unsigned &ImplicitSReg) { // If the DPR is defined or used already, the other SPR lane will be chained // correctly, so there is nothing to be done. - if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) { + if (MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI)) { ImplicitSReg = 0; return true; } @@ -4224,7 +4258,7 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, ImplicitSReg = TRI->getSubReg(DReg, (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1); MachineBasicBlock::LivenessQueryResult LQR = - MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI); + MI.getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI); if (LQR == MachineBasicBlock::LQR_Live) return true; @@ -4237,106 +4271,105 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, return true; } -void -ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { +void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, + unsigned Domain) const { unsigned DstReg, SrcReg, DReg; unsigned Lane; - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); const TargetRegisterInfo *TRI = &getRegisterInfo(); - switch (MI->getOpcode()) { - default: - llvm_unreachable("cannot handle opcode!"); + switch (MI.getOpcode()) { + default: + llvm_unreachable("cannot handle opcode!"); + break; + case ARM::VMOVD: + if (Domain != ExeNEON) break; - case ARM::VMOVD: - if (Domain != ExeNEON) - break; - // Zap the predicate operands. - assert(!isPredicated(MI) && "Cannot predicate a VORRd"); + // Zap the predicate operands. + assert(!isPredicated(MI) && "Cannot predicate a VORRd"); - // Make sure we've got NEON instructions. - assert(Subtarget.hasNEON() && "VORRd requires NEON"); + // Make sure we've got NEON instructions. 
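The VMOVD case being rewritten here re-encodes a VFP double move as NEON's VORRd DDst, DSrc, DSrc, which is a copy because OR-ing a value with itself is the identity. A trivial self-contained check of that algebraic fact, with plain integers standing in for D registers:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      for (uint64_t X : {0ull, 1ull, 0xDEADBEEFCAFEF00Dull, ~0ull})
        assert((X | X) == X); // ORR with identical sources is a plain copy
    }

The same hunk also retires the hard isCortexA9() test in favor of Subtarget.useNEONForFPMovs(), continuing the patch-wide pattern of asking the subtarget for properties instead of CPU names; hence the NEON assertion that follows.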
+ assert(Subtarget.hasNEON() && "VORRd requires NEON"); - // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits) - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(1).getReg(); + // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits) + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); - for (unsigned i = MI->getDesc().getNumOperands(); i; --i) - MI->RemoveOperand(i-1); + for (unsigned i = MI.getDesc().getNumOperands(); i; --i) + MI.RemoveOperand(i - 1); - // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits) - MI->setDesc(get(ARM::VORRd)); - AddDefaultPred(MIB.addReg(DstReg, RegState::Define) - .addReg(SrcReg) - .addReg(SrcReg)); + // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits) + MI.setDesc(get(ARM::VORRd)); + AddDefaultPred( + MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg)); + break; + case ARM::VMOVRS: + if (Domain != ExeNEON) break; - case ARM::VMOVRS: - if (Domain != ExeNEON) - break; - assert(!isPredicated(MI) && "Cannot predicate a VGETLN"); + assert(!isPredicated(MI) && "Cannot predicate a VGETLN"); - // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits) - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(1).getReg(); + // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits) + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); - for (unsigned i = MI->getDesc().getNumOperands(); i; --i) - MI->RemoveOperand(i-1); + for (unsigned i = MI.getDesc().getNumOperands(); i; --i) + MI.RemoveOperand(i - 1); - DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane); + DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane); - // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps) - // Note that DSrc has been widened and the other lane may be undef, which - // contaminates the entire register. - MI->setDesc(get(ARM::VGETLNi32)); - AddDefaultPred(MIB.addReg(DstReg, RegState::Define) - .addReg(DReg, RegState::Undef) - .addImm(Lane)); + // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps) + // Note that DSrc has been widened and the other lane may be undef, which + // contaminates the entire register. + MI.setDesc(get(ARM::VGETLNi32)); + AddDefaultPred(MIB.addReg(DstReg, RegState::Define) + .addReg(DReg, RegState::Undef) + .addImm(Lane)); - // The old source should be an implicit use, otherwise we might think it - // was dead before here. - MIB.addReg(SrcReg, RegState::Implicit); + // The old source should be an implicit use, otherwise we might think it + // was dead before here. 
+ MIB.addReg(SrcReg, RegState::Implicit); + break; + case ARM::VMOVSR: { + if (Domain != ExeNEON) break; - case ARM::VMOVSR: { - if (Domain != ExeNEON) - break; - assert(!isPredicated(MI) && "Cannot predicate a VSETLN"); + assert(!isPredicated(MI) && "Cannot predicate a VSETLN"); - // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits) - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(1).getReg(); + // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits) + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); - DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); + DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); - unsigned ImplicitSReg; - if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg)) - break; + unsigned ImplicitSReg; + if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg)) + break; - for (unsigned i = MI->getDesc().getNumOperands(); i; --i) - MI->RemoveOperand(i-1); + for (unsigned i = MI.getDesc().getNumOperands(); i; --i) + MI.RemoveOperand(i - 1); - // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps) - // Again DDst may be undefined at the beginning of this instruction. - MI->setDesc(get(ARM::VSETLNi32)); - MIB.addReg(DReg, RegState::Define) - .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI))) - .addReg(SrcReg) - .addImm(Lane); - AddDefaultPred(MIB); + // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps) + // Again DDst may be undefined at the beginning of this instruction. + MI.setDesc(get(ARM::VSETLNi32)); + MIB.addReg(DReg, RegState::Define) + .addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI))) + .addReg(SrcReg) + .addImm(Lane); + AddDefaultPred(MIB); - // The narrower destination must be marked as set to keep previous chains - // in place. - MIB.addReg(DstReg, RegState::Define | RegState::Implicit); - if (ImplicitSReg != 0) - MIB.addReg(ImplicitSReg, RegState::Implicit); - break; + // The narrower destination must be marked as set to keep previous chains + // in place. 
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); + break; } case ARM::VMOVS: { if (Domain != ExeNEON) break; // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits) - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); unsigned DstLane = 0, SrcLane = 0, DDst, DSrc; DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane); @@ -4346,16 +4379,16 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg)) break; - for (unsigned i = MI->getDesc().getNumOperands(); i; --i) - MI->RemoveOperand(i-1); + for (unsigned i = MI.getDesc().getNumOperands(); i; --i) + MI.RemoveOperand(i - 1); if (DSrc == DDst) { // Destination can be: // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits) - MI->setDesc(get(ARM::VDUPLN32d)); + MI.setDesc(get(ARM::VDUPLN32d)); MIB.addReg(DDst, RegState::Define) - .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI))) - .addImm(SrcLane); + .addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI))) + .addImm(SrcLane); AddDefaultPred(MIB); // Neither the source or the destination are naturally represented any // more, so add them in manually. @@ -4380,18 +4413,18 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // Pattern of the MachineInstrs is: // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits) MachineInstrBuilder NewMIB; - NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - get(ARM::VEXTd32), DDst); + NewMIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::VEXTd32), + DDst); // On the first instruction, both DSrc and DDst may be <undef> if present. // Specifically when the original instruction didn't have them as an // <imp-use>. unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst; - bool CurUndef = !MI->readsRegister(CurReg, TRI); + bool CurUndef = !MI.readsRegister(CurReg, TRI); NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst; - CurUndef = !MI->readsRegister(CurReg, TRI); + CurUndef = !MI.readsRegister(CurReg, TRI); NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); NewMIB.addImm(1); @@ -4400,17 +4433,17 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { if (SrcLane == DstLane) NewMIB.addReg(SrcReg, RegState::Implicit); - MI->setDesc(get(ARM::VEXTd32)); + MI.setDesc(get(ARM::VEXTd32)); MIB.addReg(DDst, RegState::Define); // On the second instruction, DDst has definitely been defined above, so // it is not <undef>. DSrc, if present, can be <undef> as above. CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst; - CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI); MIB.addReg(CurReg, getUndefRegState(CurUndef)); CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst; - CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI); MIB.addReg(CurReg, getUndefRegState(CurUndef)); MIB.addImm(1); @@ -4446,24 +4479,23 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { // VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops. // // FCONSTD can be used as a dependency-breaking instruction.
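To make that comment concrete: a VLDRS into s0 writes only half of d0, so out-of-order cores treat it as depending on d0's previous producer; inserting the cheap full-width write vmov.f64 d0, #0.5 (FCONSTD, immediate encoding 96 per the comment below) just before it cuts that false dependency. The decision protocol between scheduler and target, in toy form; none of these types are LLVM's, and the clearance value now comes from Subtarget.getPartialUpdateClearance() rather than the old Swift/A15-only flag:

    // Toy model of getPartialRegUpdateClearance's early-outs.
    struct ToyDef {
      bool ReadsOwnDef;    // instruction already reads the register it defines
      bool WritesFullDReg; // the write already covers the whole D-register
    };

    static unsigned partialRegUpdateClearance(const ToyDef &D,
                                              unsigned SubtargetClearance) {
      if (SubtargetClearance == 0)
        return 0; // target reports no penalty for partial updates
      if (D.ReadsOwnDef)
        return 0; // a real dependency; nothing false to break
      if (D.WritesFullDReg)
        return 0; // full-width write already breaks the dependency
      return SubtargetClearance; // keep defs at least this many instrs away
    }

When the scheduler cannot keep the previous def that far away, it calls breakPartialRegDependency, which emits the FCONSTD shown below.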
-unsigned ARMBaseInstrInfo::
-getPartialRegUpdateClearance(const MachineInstr *MI,
-                             unsigned OpNum,
-                             const TargetRegisterInfo *TRI) const {
-  if (!SwiftPartialUpdateClearance ||
-      !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
+    const MachineInstr &MI, unsigned OpNum,
+    const TargetRegisterInfo *TRI) const {
+  auto PartialUpdateClearance = Subtarget.getPartialUpdateClearance();
+  if (!PartialUpdateClearance)
     return 0;
 
   assert(TRI && "Need TRI instance");
 
-  const MachineOperand &MO = MI->getOperand(OpNum);
+  const MachineOperand &MO = MI.getOperand(OpNum);
   if (MO.readsReg())
     return 0;
   unsigned Reg = MO.getReg();
   int UseOp = -1;
 
-  switch(MI->getOpcode()) {
-  // Normal instructions writing only an S-register.
+  switch (MI.getOpcode()) {
+    // Normal instructions writing only an S-register.
   case ARM::VLDRS:
   case ARM::FCONSTS:
   case ARM::VMOVSR:
@@ -4472,7 +4504,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
   case ARM::VMOVv2i32:
   case ARM::VMOVv2f32:
   case ARM::VMOVv1i64:
-    UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+    UseOp = MI.findRegisterUseOperandIdx(Reg, false, TRI);
     break;
 
     // Explicitly reads the dependency.
@@ -4485,37 +4517,35 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
 
   // If this instruction actually reads a value from Reg, there is no unwanted
   // dependency.
-  if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+  if (UseOp != -1 && MI.getOperand(UseOp).readsReg())
     return 0;
 
   // We must be able to clobber the whole D-reg.
   if (TargetRegisterInfo::isVirtualRegister(Reg)) {
     // Virtual register must be a foo:ssub_0<def,undef> operand.
-    if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+    if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
       return 0;
   } else if (ARM::SPRRegClass.contains(Reg)) {
     // Physical register: MI must define the full D-reg.
     unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
                                              &ARM::DPRRegClass);
-    if (!DReg || !MI->definesRegister(DReg, TRI))
+    if (!DReg || !MI.definesRegister(DReg, TRI))
       return 0;
   }
 
   // MI has an unwanted D-register dependency.
   // Avoid defs in the previous N instructions.
-  return SwiftPartialUpdateClearance;
+  return PartialUpdateClearance;
 }
 
 // Break a partial register dependency after getPartialRegUpdateClearance
 // returned non-zero.
-void ARMBaseInstrInfo::
-breakPartialRegDependency(MachineBasicBlock::iterator MI,
-                          unsigned OpNum,
-                          const TargetRegisterInfo *TRI) const {
-  assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+void ARMBaseInstrInfo::breakPartialRegDependency(
+    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+  assert(OpNum < MI.getDesc().getNumDefs() && "OpNum is not a def");
   assert(TRI && "Need TRI instance");
 
-  const MachineOperand &MO = MI->getOperand(OpNum);
+  const MachineOperand &MO = MI.getOperand(OpNum);
   unsigned Reg = MO.getReg();
   assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
          "Can't break virtual register dependencies.");
@@ -4528,7 +4558,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
   }
   assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
-  assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+  assert(MI.definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
 
   // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
   // the full D-register by loading the same value to both lanes.  The
@@ -4538,9 +4568,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
 
   // Insert the dependency-breaking FCONSTD before MI.
   // 96 is the encoding of 0.5, but the actual value doesn't matter here.
-  AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                         get(ARM::FCONSTD), DReg).addImm(96));
-  MI->addRegisterKilled(DReg, TRI, true);
+  AddDefaultPred(
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+          .addImm(96));
+  MI.addRegisterKilled(DReg, TRI, true);
 }
 
 bool ARMBaseInstrInfo::hasNOP() const {
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index d80c49494c77..52b0ff17dea2 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -36,8 +36,7 @@ protected:
   explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
 
   void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
-                                unsigned LoadImmOpc, unsigned LoadOpc,
-                                Reloc::Model RM) const;
+                                unsigned LoadImmOpc, unsigned LoadOpc) const;
 
   /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
   /// and \p DefIdx.
@@ -93,8 +92,7 @@ protected:
   /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands, null pointer is returned in such cases.
-  MachineInstr *commuteInstructionImpl(MachineInstr *MI,
-                                       bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                        unsigned OpIdx1,
                                        unsigned OpIdx2) const override;
 
@@ -107,7 +105,7 @@ public:
   virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
 
   MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
-                                      MachineBasicBlock::iterator &MBBI,
+                                      MachineInstr &MI,
                                       LiveVariables *LV) const override;
 
   virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
@@ -122,49 +120,49 @@ public:
                              const ScheduleDAG *DAG) const override;
 
   // Branch analysis.
-  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify = false) const override;
   unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
-                        DebugLoc DL) const override;
+                        const DebugLoc &DL) const override;
 
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
   // Predication support.
-  bool isPredicated(const MachineInstr *MI) const override;
+  bool isPredicated(const MachineInstr &MI) const override;
 
-  ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
-    int PIdx = MI->findFirstPredOperandIdx();
-    return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+  ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
+    int PIdx = MI.findFirstPredOperandIdx();
+    return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
                       : ARMCC::AL;
   }
 
-  bool PredicateInstruction(MachineInstr *MI,
-                            ArrayRef<MachineOperand> Pred) const override;
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Pred) const override;
 
   bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
                          ArrayRef<MachineOperand> Pred2) const override;
 
-  bool DefinesPredicate(MachineInstr *MI,
+  bool DefinesPredicate(MachineInstr &MI,
                         std::vector<MachineOperand> &Pred) const override;
 
-  bool isPredicable(MachineInstr *MI) const override;
+  bool isPredicable(MachineInstr &MI) const override;
 
   /// GetInstSize - Returns the size of the specified MachineInstr.
/// - virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; + virtual unsigned GetInstSizeInBytes(const MachineInstr &MI) const; - unsigned isLoadFromStackSlot(const MachineInstr *MI, + unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, + unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + unsigned isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -175,7 +173,7 @@ public: const ARMSubtarget &Subtarget) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -190,21 +188,21 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, - const MachineInstr *Orig, + const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override; - MachineInstr *duplicate(MachineInstr *Orig, + MachineInstr *duplicate(MachineInstr &Orig, MachineFunction &MF) const override; const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI) const; - bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1, + bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const override; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to @@ -227,7 +225,7 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool isSchedulingBoundary(const MachineInstr *MI, + bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override; @@ -252,7 +250,7 @@ public: /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction /// can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const override; @@ -260,30 +258,29 @@ public: /// that we can remove a "comparison with zero"; Remove a redundant CMP /// instruction if the flags can be updated in the same way by an earlier /// instruction such as SUB. 
-  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
                             unsigned SrcReg2, int CmpMask, int CmpValue,
                             const MachineRegisterInfo *MRI) const override;
 
-  bool analyzeSelect(const MachineInstr *MI,
-                     SmallVectorImpl<MachineOperand> &Cond,
-                     unsigned &TrueOp, unsigned &FalseOp,
-                     bool &Optimizable) const override;
+  bool analyzeSelect(const MachineInstr &MI,
+                     SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+                     unsigned &FalseOp, bool &Optimizable) const override;
 
-  MachineInstr *optimizeSelect(MachineInstr *MI,
+  MachineInstr *optimizeSelect(MachineInstr &MI,
                                SmallPtrSetImpl<MachineInstr *> &SeenMIs,
                                bool) const override;
 
   /// FoldImmediate - 'Reg' is known to be defined by a move immediate
   /// instruction, try to fold the immediate into the use instruction.
-  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
-                     unsigned Reg, MachineRegisterInfo *MRI) const override;
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+                     MachineRegisterInfo *MRI) const override;
 
   unsigned getNumMicroOps(const InstrItineraryData *ItinData,
-                          const MachineInstr *MI) const override;
+                          const MachineInstr &MI) const override;
 
   int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MachineInstr *DefMI, unsigned DefIdx,
-                        const MachineInstr *UseMI,
+                        const MachineInstr &DefMI, unsigned DefIdx,
+                        const MachineInstr &UseMI,
                         unsigned UseIdx) const override;
   int getOperandLatency(const InstrItineraryData *ItinData,
                         SDNode *DefNode, unsigned DefIdx,
@@ -291,19 +288,20 @@
   /// VFP/NEON execution domains.
   std::pair<uint16_t, uint16_t>
-  getExecutionDomain(const MachineInstr *MI) const override;
-  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+  getExecutionDomain(const MachineInstr &MI) const override;
+  void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
 
-  unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
-                                        const TargetRegisterInfo*) const override;
-  void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+  unsigned
+  getPartialRegUpdateClearance(const MachineInstr &, unsigned,
+                               const TargetRegisterInfo *) const override;
+  void breakPartialRegDependency(MachineInstr &, unsigned,
                                  const TargetRegisterInfo *TRI) const override;
 
   /// Get the number of addresses by LDM or VLDM or zero for unknown.
-  unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+  unsigned getNumLDMAddresses(const MachineInstr &MI) const;
 
 private:
-  unsigned getInstBundleLength(const MachineInstr *MI) const;
+  unsigned getInstBundleLength(const MachineInstr &MI) const;
 
   int getVLDMDefCycle(const InstrItineraryData *ItinData,
                       const MCInstrDesc &DefMCID,
@@ -327,10 +325,17 @@ private:
                         const MCInstrDesc &UseMCID,
                         unsigned UseIdx, unsigned UseAlign) const;
 
-  unsigned getPredicationCost(const MachineInstr *MI) const override;
+  int getOperandLatencyImpl(const InstrItineraryData *ItinData,
+                            const MachineInstr &DefMI, unsigned DefIdx,
+                            const MCInstrDesc &DefMCID, unsigned DefAdj,
+                            const MachineOperand &DefMO, unsigned Reg,
+                            const MachineInstr &UseMI, unsigned UseIdx,
+                            const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+
+  unsigned getPredicationCost(const MachineInstr &MI) const override;
 
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
-                           const MachineInstr *MI,
+                           const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
   int getInstrLatency(const InstrItineraryData *ItinData,
@@ -338,19 +343,18 @@ private:
 
   bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
                              const MachineRegisterInfo *MRI,
-                             const MachineInstr *DefMI, unsigned DefIdx,
-                             const MachineInstr *UseMI,
+                             const MachineInstr &DefMI, unsigned DefIdx,
+                             const MachineInstr &UseMI,
                              unsigned UseIdx) const override;
   bool hasLowDefLatency(const TargetSchedModel &SchedModel,
-                        const MachineInstr *DefMI,
+                        const MachineInstr &DefMI,
                         unsigned DefIdx) const override;
 
   /// verifyInstruction - Perform target specific instruction verification.
-  bool verifyInstruction(const MachineInstr *MI,
+  bool verifyInstruction(const MachineInstr &MI,
                          StringRef &ErrInfo) const override;
 
-  virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
-                                    Reloc::Model RM) const = 0;
+  virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0;
 
   void expandMEMCPY(MachineBasicBlock::iterator) const;
 
@@ -447,7 +451,7 @@ static inline bool isPushOpcode(int Opc) {
 /// getInstrPredicate - If instruction is predicated, returns its predicate
 /// condition, otherwise returns AL. It also returns the condition code
 /// register by reference.
-ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
 
 unsigned getMatchingCondBranchOpcode(unsigned Opc);
 
@@ -466,21 +470,24 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
 /// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
 /// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, - unsigned DestReg, unsigned BaseReg, int NumBytes, + MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, + unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitT2RegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, - unsigned DestReg, unsigned BaseReg, int NumBytes, + MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, + unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, - unsigned DestReg, unsigned BaseReg, - int NumBytes, const TargetInstrInfo &TII, - const ARMBaseRegisterInfo& MRI, + MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, + unsigned BaseReg, int NumBytes, + const TargetInstrInfo &TII, + const ARMBaseRegisterInfo &MRI, unsigned MIFlags = 0); /// Tries to add registers to the reglist of a given base-updating diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index a5207705fc69..aa968efc37d4 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -49,12 +49,9 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo() : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {} static unsigned getFramePointerReg(const ARMSubtarget &STI) { - if (STI.isTargetMachO()) { - if (STI.isTargetDarwin() || STI.isThumb1Only()) - return ARM::R7; - else - return ARM::R11; - } else if (STI.isTargetWindows()) + if (STI.isTargetMachO()) + return ARM::R7; + else if (STI.isTargetWindows()) return ARM::R11; else // ARM EABI return STI.isThumb() ? ARM::R7 : ARM::R11; @@ -63,8 +60,11 @@ static unsigned getFramePointerReg(const ARMSubtarget &STI) { const MCPhysReg* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const ARMSubtarget &STI = MF->getSubtarget(); + bool UseSplitPush = STI.splitFramePushPop(); const MCPhysReg *RegList = - STI.isTargetDarwin() ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; + STI.isTargetDarwin() + ? CSR_iOS_SaveList + : (UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList); const Function *F = MF->getFunction(); if (F->getCallingConv() == CallingConv::GHC) { @@ -75,7 +75,7 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (STI.isMClass()) { // M-class CPUs have hardware which saves the registers needed to allow a // function conforming to the AAPCS to function as a handler. - return CSR_AAPCS_SaveList; + return UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList; } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") { // Fast interrupt mode gives the handler a private copy of R8-R14, so less // need to be saved to restore user-mode state. @@ -87,6 +87,10 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } + if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && + F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_iOS_SwiftError_SaveList; + if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) return MF->getInfo()->isSplitCSR() ? 
CSR_iOS_CXX_TLS_PE_SaveList @@ -110,6 +114,11 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic becase all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; + + if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && + MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_iOS_SwiftError_RegMask; + if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS) return CSR_iOS_CXX_TLS_RegMask; return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; @@ -167,9 +176,8 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(ARM::R9); // Reserve D16-D31 if the subtarget doesn't support them. if (!STI.hasVFP3() || STI.hasD16()) { - assert(ARM::D31 == ARM::D16 + 15); - for (unsigned i = 0; i != 16; ++i) - Reserved.set(ARM::D16 + i); + static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!"); + Reserved.set(ARM::D16, ARM::D31 + 1); } const TargetRegisterClass *RC = &ARM::GPRPairRegClass; for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I) @@ -400,13 +408,10 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. -void ARMBaseRegisterInfo:: -emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, int Val, - ARMCC::CondCodes Pred, - unsigned PredReg, unsigned MIFlags) const { +void ARMBaseRegisterInfo::emitLoadConstPool( + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineConstantPool *ConstantPool = MF.getConstantPool(); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 6a9a45a65687..1eee94857e05 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -166,12 +166,12 @@ public: /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. - virtual void emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, unsigned DestReg, unsigned SubIdx, - int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0, - unsigned MIFlags = MachineInstr::NoFlags)const; + virtual void + emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, + int Val, ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags) const; /// Code Generation virtual methods... bool requiresRegisterScavenging(const MachineFunction &MF) const override; diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index a731d00883a1..71b819362404 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -211,7 +211,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // First consume all registers that would give an unaligned object. Whether // we go on stack or in regs, no-one will be using them in future. 
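The RegAlign computation that opens the next hunk is the core of CC_ARM_AAPCS_Custom_Aggregate: a byte alignment is converted into a register-index alignment over the R0-R3 argument registers. A hedged sketch of just that arithmetic (regAlign and skipToAligned are illustrative names, not LLVM's; assumes ByteAlign >= 1):

    // alignTo(ByteAlign, 4) / 4: an 8-byte-aligned aggregate must start at an
    // even register index, while 4-byte (or smaller) alignment allows any index.
    static unsigned regAlign(unsigned ByteAlign) {
      return (ByteAlign + 3) / 4; // same result as alignTo(ByteAlign, 4) / 4
    }

    // Usage mirroring the loop below: burn argument registers until the next
    // register index is a multiple of the required register alignment.
    static unsigned skipToAligned(unsigned RegIdx, unsigned ByteAlign) {
      unsigned RA = regAlign(ByteAlign);
      while (RegIdx % RA != 0)
        ++RegIdx; // the real code calls State.AllocateReg() for each skip
      return RegIdx;
    }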
-  unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+  unsigned RegAlign = alignTo(Align, 4) / 4;
   while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
     State.AllocateReg(RegList[RegIdx++]);
 
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 847ef87c1b26..edb69581b9d3 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -23,6 +23,12 @@ def CC_ARM_APCS : CallingConv<[
 
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -42,6 +48,12 @@ def RetCC_ARM_APCS : CallingConv<[
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -151,6 +163,12 @@ def CC_ARM_AAPCS : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
   CCDelegateTo<CC_ARM_AAPCS_Common>
@@ -161,6 +179,12 @@ def RetCC_ARM_AAPCS : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
@@ -179,6 +203,12 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   // HFAs are passed in a contiguous block of registers, or on the stack
   CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
 
@@ -194,6 +224,12 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
@@ -210,6 +246,14 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
 def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
                                  (sequence "D%u", 15, 8))>;
 
+// The order of callee-saved registers needs to match the order we actually push
+// them in FrameLowering, because this order is what's used by
+// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
+// pointer, we use this AAPCS alternative.
+def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+                                           R11, R10, R9, R8,
+                                           (sequence "D%u", 15, 8))>;
+
 // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
 // and the pointer return value are both passed in R0 in these cases, this can
 // be partially modelled by treating R0 as a callee-saved register
@@ -222,6 +266,9 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
 
 // Also save R7-R4 first to match the stack frame fixed spill areas.
 def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
+// R6 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
                                           (sub CSR_AAPCS_ThisReturn, R9))>;
 
@@ -235,10 +282,11 @@ def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1),
                                        (sequence "D%u", 31, 0))>;
 
 // CSRs that are handled by prologue, epilogue.
-def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>;
+def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR, R12, R11, R7, R5, R4)>;
 
 // CSRs that are handled explicitly via copies.
-def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>;
+def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS,
+                                               CSR_iOS_CXX_TLS_PE)>;
 
 // The "interrupt" attribute is used to generate code that is acceptable in
 // exception-handlers of various kinds. It makes us use a different return
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 55c1684028c2..8511f67dccd5 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -53,6 +53,11 @@ static cl::opt<bool>
 AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
           cl::desc("Adjust basic block layout to better use TB[BH]"));
 
+static cl::opt<unsigned>
+CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
+          cl::desc("The max number of iterations for convergence"));
+
+
 /// UnknownPadding - Return the worst case padding that could result from
 /// unknown offset bits.  This does not include alignment padding caused by
 /// known offset bits.
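The iteration cap wired in above guards the pass's fixed-point loop. The shape of that loop, reduced to a hedged standalone toy (relaxOnce stands in for one handleConstantPoolUser sweep over all constant-pool users; the halfway switch models the CloserWater fallback used later in this file):

    #include <cstdio>
    #include <cstdlib>

    static bool relaxOnce(bool Aggressive) {
      static int PendingChanges = 3; // toy input that settles after 3 rounds
      (void)Aggressive;
      return PendingChanges-- > 0;   // true while something still moved
    }

    static void runToConvergence(unsigned MaxIters) {
      unsigned Iters = 0;
      // Switch to the more aggressive placement heuristic halfway to the cap.
      while (relaxOnce(/*Aggressive=*/Iters >= MaxIters / 2))
        if (++Iters > MaxIters) {
          std::fprintf(stderr, "failed to converge\n");
          std::abort();              // plays the role of report_fatal_error
        }
    }

    int main() { runToConvergence(30); }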
@@ -274,6 +279,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "ARM constant island placement and branch shortening pass"; } @@ -293,10 +303,10 @@ namespace { unsigned getCombinedIndex(const MachineInstr *CPEMI); int findInRangeCPEntry(CPUser& U, unsigned UserOffset); bool findAvailableWater(CPUser&U, unsigned UserOffset, - water_iterator &WaterIter); + water_iterator &WaterIter, bool CloserWater); void createNewWater(unsigned CPUserIndex, unsigned UserOffset, MachineBasicBlock *&NewMBB); - bool handleConstantPoolUser(unsigned CPUserIndex); + bool handleConstantPoolUser(unsigned CPUserIndex, bool CloserWater); void removeDeadCPEMI(MachineInstr *CPEMI); bool removeUnusedCPEntries(); bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, @@ -456,8 +466,11 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n'); bool CPChange = false; for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) - CPChange |= handleConstantPoolUser(i); - if (CPChange && ++NoCPIters > 30) + // For most inputs, it converges in no more than 5 iterations. + // If it doesn't end in 10, the input may have huge BB or many CPEs. + // In this case, we will try different heuristics. + CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2); + if (CPChange && ++NoCPIters > CPMaxIteration) report_fatal_error("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); @@ -478,10 +491,18 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { MadeChange = true; } - // Shrink 32-bit Thumb2 branch, load, and store instructions. + // Shrink 32-bit Thumb2 load and store instructions. if (isThumb2 && !STI->prefers32BitThumb()) MadeChange |= optimizeThumb2Instructions(); + // Shrink 32-bit branch instructions. + if (isThumb && STI->hasV8MBaselineOps()) + MadeChange |= optimizeThumb2Branches(); + + // Optimize jump tables using TBB / TBH. + if (isThumb2) + MadeChange |= optimizeThumb2JumpTables(); + // After a while, this might be made debug-only, but it is not expensive. verify(); @@ -654,7 +675,7 @@ bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) { // have an unconditional branch for whatever reason. MachineBasicBlock *TBB, *FBB; SmallVector Cond; - bool TooDifficult = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond); + bool TooDifficult = TII->analyzeBranch(*MBB, TBB, FBB, Cond); return TooDifficult || FBB == nullptr; } @@ -701,14 +722,10 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { /// information about the sizes of each block and the locations of all /// the jump tables. 
void ARMConstantIslands::scanFunctionJumpTables() { - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) - if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT) - T2JumpTables.push_back(I); + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &I : MBB) + if (I.isBranch() && I.getOpcode() == ARM::t2BR_JT) + T2JumpTables.push_back(&I); } } @@ -735,22 +752,18 @@ initializeFunctionInfo(const std::vector &CPEMIs) { adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - + for (MachineBasicBlock &MBB : *MF) { // If this block doesn't fall through into the next MBB, then this is // 'water' that a constant pool island could be placed. if (!BBHasFallthrough(&MBB)) WaterList.push_back(&MBB); - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (I->isDebugValue()) + for (MachineInstr &I : MBB) { + if (I.isDebugValue()) continue; - unsigned Opc = I->getOpcode(); - if (I->isBranch()) { + unsigned Opc = I.getOpcode(); + if (I.isBranch()) { bool isCond = false; unsigned Bits = 0; unsigned Scale = 1; @@ -759,7 +772,7 @@ initializeFunctionInfo(const std::vector &CPEMIs) { default: continue; // Ignore other JT branches case ARM::t2BR_JT: - T2JumpTables.push_back(I); + T2JumpTables.push_back(&I); continue; // Does not get an entry in ImmBranches case ARM::Bcc: isCond = true; @@ -793,11 +806,11 @@ initializeFunctionInfo(const std::vector &CPEMIs) { // Record this immediate branch. unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; - ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc)); + ImmBranches.push_back(ImmBranch(&I, MaxOffs, isCond, UOpc)); } if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET) - PushPopMIs.push_back(I); + PushPopMIs.push_back(&I); if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS || Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB || @@ -805,8 +818,8 @@ initializeFunctionInfo(const std::vector &CPEMIs) { continue; // Scan the instructions for constant pool operands. - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) - if (I->getOperand(op).isCPI() || I->getOperand(op).isJTI()) { + for (unsigned op = 0, e = I.getNumOperands(); op != e; ++op) + if (I.getOperand(op).isCPI() || I.getOperand(op).isJTI()) { // We found one. The addressing mode tells us the max displacement // from the PC that this instruction permits. @@ -865,15 +878,15 @@ initializeFunctionInfo(const std::vector &CPEMIs) { } // Remember that this is a user of a CP entry. - unsigned CPI = I->getOperand(op).getIndex(); - if (I->getOperand(op).isJTI()) { + unsigned CPI = I.getOperand(op).getIndex(); + if (I.getOperand(op).isJTI()) { JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size())); CPI = JumpTableEntryIndices[CPI]; } MachineInstr *CPEMI = CPEMIs[CPI]; unsigned MaxOffs = ((1 << Bits)-1) * Scale; - CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm)); + CPUsers.push_back(CPUser(&I, CPEMI, MaxOffs, NegOk, IsSoImm)); // Increment corresponding CPEntry reference count. 
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); @@ -896,15 +909,14 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) { BBI.Unalign = 0; BBI.PostAlign = 0; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { + for (MachineInstr &I : *MBB) { BBI.Size += TII->GetInstSizeInBytes(I); // For inline asm, GetInstSizeInBytes returns a conservative estimate. // The actual size may be smaller, but still a multiple of the instr size. - if (I->isInlineAsm()) + if (I.isInlineAsm()) BBI.Unalign = isThumb ? 1 : 2; // Also consider instructions that may be shrunk later. - else if (isThumb && mayOptimizeThumb2Instruction(I)) + else if (isThumb && mayOptimizeThumb2Instruction(&I)) BBI.Unalign = 1; } @@ -929,7 +941,7 @@ unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const { // Sum instructions before MI in MBB. for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); + Offset += TII->GetInstSizeInBytes(*I); } return Offset; } @@ -1108,7 +1120,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. - Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment); + Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding @@ -1285,11 +1297,27 @@ static inline unsigned getUnconditionalBrDisp(int Opc) { /// move to a lower address, so search backward from the end of the list and /// prefer the first water that is in range. bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, - water_iterator &WaterIter) { + water_iterator &WaterIter, + bool CloserWater) { if (WaterList.empty()) return false; unsigned BestGrowth = ~0u; + // The nearest water without splitting the UserBB is right after it. + // If the distance is still large (we have a big BB), then we need to split it + // if we don't converge after certain iterations. This helps the following + // situation to converge: + // BB0: + // Big BB + // BB1: + // Constant Pool + // When a CP access is out of range, BB0 may be used as water. However, + // inserting islands between BB0 and BB1 makes other accesses out of range. + MachineBasicBlock *UserBB = U.MI->getParent(); + unsigned MinNoSplitDisp = + BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI)); + if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2) + return false; for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; --IP) { MachineBasicBlock* WaterBB = *IP; @@ -1301,6 +1329,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, // should be relatively uncommon and when it does happen, we want to be // sure to take advantage of it for all the CPEs near that block, so that // we don't insert more branches than necessary. + // When CloserWater is true, we try to find the lowest address after (or + // equal to) user MI's BB no matter of padding growth. 
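Condensed into a toy, the water search these comments describe looks like the following (Water and findWater are illustrative, not the pass's types): scan candidates from the end, keep the least-growth option, and in closer-water mode settle for the user's own block as soon as it qualifies rather than holding out for a perfect fit.

    #include <climits>
    #include <vector>

    struct Water {
      int Growth;       // padding bytes this placement would add
      bool IsUserBlock; // the water immediately after the user's block
    };

    // Returns the index of the chosen water, or -1 if none qualifies.
    int findWater(const std::vector<Water> &Cands, bool CloserWater) {
      int Best = -1, BestGrowth = INT_MAX;
      for (int I = static_cast<int>(Cands.size()) - 1; I >= 0; --I) {
        if (Cands[I].Growth >= BestGrowth)
          continue;
        Best = I;
        BestGrowth = Cands[I].Growth;
        if (CloserWater && Cands[I].IsUserBlock)
          break; // good enough: the closest usable water
        if (!CloserWater && BestGrowth == 0)
          break; // perfect fit, stop searching
      }
      return Best;
    }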
unsigned Growth; if (isWaterInRange(UserOffset, WaterBB, U, Growth) && (WaterBB->getNumber() < U.HighWaterMark->getNumber() || @@ -1312,8 +1342,11 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber() << " Growth=" << Growth << '\n'); - // Keep looking unless it is perfect. - if (BestGrowth == 0) + if (CloserWater && WaterBB == U.MI->getParent()) + return true; + // Keep looking unless it is perfect and we're not looking for the lowest + // possible address. + if (!CloserWater && BestGrowth == 0) return true; } if (IP == B) @@ -1416,7 +1449,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // iterates at least once. BaseInsertOffset = std::max(UserBBI.postOffset() - UPad - 8, - UserOffset + TII->GetInstSizeInBytes(UserMI) + 1); + UserOffset + TII->GetInstSizeInBytes(*UserMI) + 1); DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset)); } unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad + @@ -1426,11 +1459,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, unsigned CPUIndex = CPUserIndex+1; unsigned NumCPUsers = CPUsers.size(); MachineInstr *LastIT = nullptr; - for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); + for (unsigned Offset = UserOffset + TII->GetInstSizeInBytes(*UserMI); Offset < BaseInsertOffset; - Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) { + Offset += TII->GetInstSizeInBytes(*MI), MI = std::next(MI)) { assert(MI != UserMBB->end() && "Fell off end of block"); - if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { + if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == &*MI) { CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { // Shift intertion point by one unit of alignment so it is within reach. @@ -1447,7 +1480,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Remember the last IT instruction. if (MI->getOpcode() == ARM::t2IT) - LastIT = MI; + LastIT = &*MI; } --MI; @@ -1455,23 +1488,24 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Avoid splitting an IT block. if (LastIT) { unsigned PredReg = 0; - ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg); + ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg); if (CC != ARMCC::AL) MI = LastIT; } // We really must not split an IT block. DEBUG(unsigned PredReg; - assert(!isThumb || getITInstrPredicate(MI, PredReg) == ARMCC::AL)); + assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL)); - NewMBB = splitBlockBeforeInstr(MI); + NewMBB = splitBlockBeforeInstr(&*MI); } /// handleConstantPoolUser - Analyze the specified user, checking to see if it /// is out-of-range. If so, pick up the constant pool value and move it some /// place in-range. Return true if we changed any addresses (thus must run /// another pass of branch lengthening), false otherwise. 
-bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { +bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, + bool CloserWater) { CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; @@ -1494,7 +1528,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock(); MachineBasicBlock *NewMBB; water_iterator IP; - if (findAvailableWater(U, UserOffset, IP)) { + if (findAvailableWater(U, UserOffset, IP, CloserWater)) { DEBUG(dbgs() << "Found water in range\n"); MachineBasicBlock *WaterBB = *IP; @@ -1584,7 +1618,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { CPEBB->setAlignment(0); } else // Entries are sorted by descending alignment, so realign from the front. - CPEBB->setAlignment(getCPELogAlign(CPEBB->begin())); + CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin())); adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if @@ -1728,7 +1762,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { splitBlockBeforeInstr(MI); // No need for the branch to the next block. We're adding an unconditional // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); + int delta = TII->GetInstSizeInBytes(MBB->back()); BBInfo[MBB->getNumber()].Size -= delta; MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below @@ -1744,18 +1778,18 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode())) .addMBB(NextBB).addImm(CC).addReg(CCReg); Br.MI = &MBB->back(); - BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); if (isThumb) BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB) .addImm(ARMCC::AL).addReg(0); else BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); - BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); // Remove the old conditional branch. It may or may not still be in MBB. 
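The displacement math behind this conditional-branch fixup, restated as a hedged helper (inBranchRange is an illustrative name): a branch with an N-bit signed, scaled immediate reaches roughly ((1 << (N-1)) - 1) * Scale bytes, matching the MaxOffs computation earlier in the pass. When the target is farther than that, the pass inverts the condition so a short hop skips over a newly inserted unconditional branch, which has the longer range, to the real destination.

    #include <cstdint>

    static bool inBranchRange(unsigned Bits, unsigned Scale, int64_t Offset) {
      int64_t MaxDisp = ((int64_t(1) << (Bits - 1)) - 1) * Scale;
      return Offset <= MaxDisp && Offset >= -MaxDisp;
    }

    // E.g. a Thumb2 Bcc with a 20-bit immediate scaled by 2 reaches about
    // +/- 1 MiB: inBranchRange(20, 2, offset).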
- BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI); MI->eraseFromParent(); adjustBBOffsetsAfter(MBB); return true; @@ -1852,8 +1886,6 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { } } - MadeChange |= optimizeThumb2Branches(); - MadeChange |= optimizeThumb2JumpTables(); return MadeChange; } @@ -1910,7 +1942,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() { NewOpc = 0; unsigned PredReg = 0; - ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) @@ -1928,7 +1960,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() { --CmpMI; if (CmpMI->getOpcode() == ARM::tCMPi8) { unsigned Reg = CmpMI->getOperand(0).getReg(); - Pred = getInstrPredicate(CmpMI, PredReg); + Pred = getInstrPredicate(*CmpMI, PredReg); if (Pred == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 && isARMLowRegister(Reg)) { @@ -2170,8 +2202,8 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { } } - unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI); - unsigned OrigSize = TII->GetInstSizeInBytes(MI); + unsigned NewSize = TII->GetInstSizeInBytes(*NewJTMI); + unsigned OrigSize = TII->GetInstSizeInBytes(*MI); MI->eraseFromParent(); int Delta = OrigSize - NewSize + DeadSize; @@ -2240,13 +2272,13 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction::iterator OldPrior = std::prev(BBi); // If the block terminator isn't analyzable, don't try to move the block - bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond); + bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond); // If the block ends in an unconditional branch, move it. The prior block // has to have an analyzable terminator for us to move this one. Be paranoid // and make sure we're not trying to move the entry block of the function. - if (!B && Cond.empty() && BB != MF->begin() && - !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { + if (!B && Cond.empty() && BB != &MF->front() && + !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { BB->moveAfter(JTBB); OldPrior->updateTerminator(); BB->updateTerminator(); diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index c9849b2605ea..c0db001cb6f1 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -50,11 +50,18 @@ const char *ARMConstantPoolValue::getModifierText() const { switch (Modifier) { // FIXME: Are these case sensitive? It'd be nice to lower-case all the // strings if that's legal. 
- case ARMCP::no_modifier: return "none"; - case ARMCP::TLSGD: return "tlsgd"; - case ARMCP::GOT_PREL: return "GOT_PREL"; - case ARMCP::GOTTPOFF: return "gottpoff"; - case ARMCP::TPOFF: return "tpoff"; + case ARMCP::no_modifier: + return "none"; + case ARMCP::TLSGD: + return "tlsgd"; + case ARMCP::GOT_PREL: + return "GOT_PREL"; + case ARMCP::GOTTPOFF: + return "gottpoff"; + case ARMCP::TPOFF: + return "tpoff"; + case ARMCP::SECREL: + return "secrel32"; } llvm_unreachable("Unknown modifier!"); } @@ -74,9 +81,9 @@ bool ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { if (ACPV->Kind == Kind && ACPV->PCAdjust == PCAdjust && - ACPV->Modifier == Modifier) { - if (ACPV->LabelId == LabelId) - return true; + ACPV->Modifier == Modifier && + ACPV->LabelId == LabelId && + ACPV->AddCurrentAddress == AddCurrentAddress) { // Two PC relative constpool entries containing the same GV address or // external symbols. FIXME: What about blockaddress? if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol) @@ -85,7 +92,7 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { return false; } -void ARMConstantPoolValue::dump() const { +LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const { errs() << " " << *this; } diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index 6b18a4e52878..c07331d71dad 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -37,11 +37,12 @@ namespace ARMCP { }; enum ARMCPModifier { - no_modifier, - TLSGD, - GOT_PREL, - GOTTPOFF, - TPOFF + no_modifier, /// None + TLSGD, /// Thread Local Storage (General Dynamic Mode) + GOT_PREL, /// Global Offset Table, PC Relative + GOTTPOFF, /// Global Offset Table, Thread Pointer Offset + TPOFF, /// Thread Pointer Offset + SECREL, /// Section Relative (Windows TLS) }; } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 56f3498e1204..56f5728ecfb8 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -20,6 +20,7 @@ #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -50,6 +51,11 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "ARM pseudo instruction expansion pass"; } @@ -58,7 +64,8 @@ namespace { void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); bool ExpandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool ExpandMBB(MachineBasicBlock &MBB); void ExpandVLD(MachineBasicBlock::iterator &MBBI); void ExpandVST(MachineBasicBlock::iterator &MBBI); @@ -67,6 +74,14 @@ namespace { unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); + bool ExpandCMP_SWAP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned LdrexOp, + unsigned StrexOp, unsigned UxtOp, + MachineBasicBlock::iterator &NextMBBI); + + bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char ARMExpandPseudo::ID = 0; } @@ -651,7 +666,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; - ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; @@ -737,8 +752,242 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MI.eraseFromParent(); } +static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MBB->addLiveIn(*I); +} + +/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as +/// possible. This only gets used at -O0 so we don't care about efficiency of the +/// generated code. +bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned LdrexOp, unsigned StrexOp, + unsigned UxtOp, + MachineBasicBlock::iterator &NextMBBI) { + bool IsThumb = STI->isThumb(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + if (UxtOp) { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg()) + .addReg(Desired.getReg(), RegState::Kill); + if (!IsThumb) + MIB.addImm(0); + AddDefaultPred(MIB); + } + + // .Lloadcmp: + // ldrex rDest, [rAddr] + // cmp rDest, rDesired + // bne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + MachineInstrBuilder MIB; + MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg()); + MIB.addReg(Addr.getReg()); + if (LdrexOp == ARM::t2LDREX) + MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset. + AddDefaultPred(MIB); + + unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; + AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) + .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) + .addOperand(Desired)); + unsigned Bcc = IsThumb ? 
ARM::tBcc : ARM::Bcc; + BuildMI(LoadCmpBB, DL, TII->get(Bcc)) + .addMBB(DoneBB) + .addImm(ARMCC::NE) + .addReg(ARM::CPSR, RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // strex rStatus, rNew, [rAddr] + // cmp rStatus, #0 + // bne .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + + MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg); + MIB.addOperand(New); + MIB.addOperand(Addr); + if (StrexOp == ARM::t2STREX) + MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. + AddDefaultPred(MIB); + + unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; + AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri)) + .addReg(StatusReg, RegState::Kill) + .addImm(0)); + BuildMI(StoreBB, DL, TII->get(Bcc)) + .addMBB(LoadCmpBB) + .addImm(ARMCC::NE) + .addReg(ARM::CPSR, RegState::Kill); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +/// ARM's ldrexd/strexd take a consecutive register pair (represented as a +/// single GPRPair register), Thumb's take two separate registers so we need to +/// extract the subregs from the pair. +static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, + unsigned Flags, bool IsThumb, + const TargetRegisterInfo *TRI) { + if (IsThumb) { + unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); + unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); + MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead())); + MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead())); + } else + MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead())); +} + +/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop. 
+bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + bool IsThumb = STI->isThumb(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); + unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); + unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); + unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldrexd rDestLo, rDestHi, [rAddr] + // cmp rDestLo, rDesiredLo + // sbcs rStatus, rDestHi, rDesiredHi + // bne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD; + MachineInstrBuilder MIB; + MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD)); + addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI); + MIB.addReg(Addr.getReg()); + AddDefaultPred(MIB); + + unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; + AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) + .addReg(DestLo, getKillRegState(Dest.isDead())) + .addReg(DesiredLo, getKillRegState(Desired.isDead()))); + + unsigned SBCrr = IsThumb ? ARM::t2SBCrr : ARM::SBCrr; + MIB = BuildMI(LoadCmpBB, DL, TII->get(SBCrr)) + .addReg(StatusReg, RegState::Define | RegState::Dead) + .addReg(DestHi, getKillRegState(Dest.isDead())) + .addReg(DesiredHi, getKillRegState(Desired.isDead())); + AddDefaultPred(MIB); + MIB.addReg(ARM::CPSR, RegState::Kill); + + unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; + BuildMI(LoadCmpBB, DL, TII->get(Bcc)) + .addMBB(DoneBB) + .addImm(ARMCC::NE) + .addReg(ARM::CPSR, RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // strexd rStatus, rNewLo, rNewHi, [rAddr] + // cmp rStatus, #0 + // bne .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD; + MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg); + addExclusiveRegPair(MIB, New, 0, IsThumb, TRI); + MIB.addOperand(Addr); + AddDefaultPred(MIB); + + unsigned CMPri = IsThumb ? 
ARM::t2CMPri : ARM::CMPri; + AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri)) + .addReg(StatusReg, RegState::Kill) + .addImm(0)); + BuildMI(StoreBB, DL, TII->get(Bcc)) + .addMBB(LoadCmpBB) + .addImm(ARMCC::NE) + .addReg(ARM::CPSR, RegState::Kill); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + + bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -784,7 +1033,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, .addReg(JumpTarget.getReg(), RegState::Kill); } - MachineInstr *NewMI = std::prev(MBBI); + auto NewMI = std::prev(MBBI); for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); @@ -1375,6 +1624,30 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; + + case ARM::CMP_SWAP_8: + if (STI->isThumb()) + return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, + ARM::tUXTB, NextMBBI); + else + return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB, + ARM::UXTB, NextMBBI); + case ARM::CMP_SWAP_16: + if (STI->isThumb()) + return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, + ARM::tUXTH, NextMBBI); + else + return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, + ARM::UXTH, NextMBBI); + case ARM::CMP_SWAP_32: + if (STI->isThumb()) + return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, + NextMBBI); + else + return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI); + + case ARM::CMP_SWAP_64: + return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); } } @@ -1384,7 +1657,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= ExpandMI(MBB, MBBI); + Modified |= ExpandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index ff2fcfa349dc..13724da5d4f7 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -22,7 +22,6 @@ #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -41,7 +40,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" @@ -110,11 +108,6 @@ class ARMFastISel final : public FastISel { const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); - unsigned fastEmitInst_rrr(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill, - unsigned Op2, bool Op2IsKill); unsigned 
fastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -164,6 +157,7 @@ class ARMFastISel final : public FastISel { // Utility routines. private: + bool isPositionIndependent() const; bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, @@ -215,7 +209,7 @@ class ARMFastISel final : public FastISel { const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); void AddLoadStoreOperands(MVT VT, Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool useAM3); + MachineMemOperand::Flags Flags, bool useAM3); }; } // end anonymous namespace @@ -331,38 +325,6 @@ unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, return ResultReg; } -unsigned ARMFastISel::fastEmitInst_rrr(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill, - unsigned Op2, bool Op2IsKill) { - unsigned ResultReg = createResultReg(RC); - const MCInstrDesc &II = TII.get(MachineInstOpcode); - - // Make sure the input operands are sufficiently constrained to be legal - // for this instruction. - Op0 = constrainOperandRegClass(II, Op0, 1); - Op1 = constrainOperandRegClass(II, Op1, 2); - Op2 = constrainOperandRegClass(II, Op1, 3); - - if (II.getNumDefs() >= 1) { - AddOptionalDefs( - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addReg(Op2, Op2IsKill * RegState::Kill)); - } else { - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addReg(Op2, Op2IsKill * RegState::Kill)); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg) - .addReg(II.ImplicitDefs[0])); - } - return ResultReg; -} - unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -576,12 +538,15 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { return ResultReg; } +bool ARMFastISel::isPositionIndependent() const { + return TLI.isPositionIndependent(); +} + unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. if (VT != MVT::i32 || GV->isThreadLocal()) return 0; - Reloc::Model RelocM = TM.getRelocationModel(); - bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); + bool IsIndirect = Subtarget->isGVIndirectSymbol(GV); const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; unsigned DestReg = createResultReg(RC); @@ -591,23 +556,20 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { bool IsThreadLocal = GVar && GVar->isThreadLocal(); if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0; + bool IsPositionIndependent = isPositionIndependent(); // Use movw+movt when possible, it avoids constant pool entries. // Non-darwin targets only support static movt relocations in FastISel. if (Subtarget->useMovt(*FuncInfo.MF) && - (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) { + (Subtarget->isTargetMachO() || !IsPositionIndependent)) { unsigned Opc; unsigned char TF = 0; if (Subtarget->isTargetMachO()) TF = ARMII::MO_NONLAZY; - switch (RelocM) { - case Reloc::PIC_: + if (IsPositionIndependent) Opc = isThumb2 ? 
ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel; - break; - default: + else Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm; - break; - } AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF)); } else { @@ -618,12 +580,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { Align = DL.getTypeAllocSize(GV->getType()); } - if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_) + if (Subtarget->isTargetELF() && IsPositionIndependent) return ARMLowerPICELF(GV, Align, VT); // Grab index. - unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : - (Subtarget->isThumb() ? 4 : 8); + unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; unsigned Id = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id, ARMCP::CPValue, @@ -633,10 +594,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // Load value. MachineInstrBuilder MIB; if (isThumb2) { - unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic; + unsigned Opc = IsPositionIndependent ? ARM::t2LDRpci_pic : ARM::t2LDRpci; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg).addConstantPoolIndex(Idx); - if (RelocM == Reloc::PIC_) + if (IsPositionIndependent) MIB.addImm(Id); AddOptionalDefs(MIB); } else { @@ -648,7 +609,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { .addImm(0); AddOptionalDefs(MIB); - if (RelocM == Reloc::PIC_) { + if (IsPositionIndependent) { unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD; unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); @@ -912,7 +873,8 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) { void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool useAM3) { + MachineMemOperand::Flags Flags, + bool useAM3) { // addrmode5 output depends on the selection dag addressing dividing the // offset by 4 that it then later multiplies. Do this here as well. if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64) @@ -931,7 +893,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, // ARM halfword load/stores and signed byte loads need an additional // operand. if (useAM3) { - signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; + int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; MIB.addReg(0); MIB.addImm(Imm); } else { @@ -945,7 +907,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, // ARM halfword load/stores and signed byte loads need an additional // operand. if (useAM3) { - signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; + int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; MIB.addReg(0); MIB.addImm(Imm); } else { @@ -1062,6 +1024,21 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { if (cast(I)->isAtomic()) return false; + const Value *SV = I->getOperand(0); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast(SV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast(SV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // Verify we have a legal type before going any further. 
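
The swifterror bail-out added above reduces to a small predicate over the IR value. A minimal sketch of that check, assuming the LLVM 3.9 IR headers; the helper name isSwiftErrorValue is illustrative and not part of the patch:

#include "llvm/IR/Argument.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Swifterror values come either from a function argument carrying the
// swifterror attribute or from an alloca marked swifterror.
static bool isSwiftErrorValue(const llvm::Value *V) {
  if (const auto *Arg = llvm::dyn_cast<llvm::Argument>(V))
    return Arg->hasSwiftErrorAttr();
  if (const auto *AI = llvm::dyn_cast<llvm::AllocaInst>(V))
    return AI->isSwiftError();
  return false;
}

FastISel has no model for the special swifterror register dataflow, so SelectLoad here (and SelectStore below) simply returns false and lets SelectionDAG handle such accesses.
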
MVT VT; if (!isLoadTypeLegal(I->getType(), VT)) @@ -1177,6 +1154,21 @@ bool ARMFastISel::SelectStore(const Instruction *I) { if (cast(I)->isAtomic()) return false; + const Value *PtrV = I->getOperand(1); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast(PtrV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast(PtrV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // Verify we have a legal type before going any further. MVT VT; if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT)) @@ -1726,6 +1718,13 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) { if (!isTypeLegal(Ty, VT)) return false; + // Many ABIs do not provide a libcall for standalone remainder, so we need to + // use divrem (see the RTABI 4.3.1). Since FastISel can't handle non-double + // multi-reg returns, we'll have to bail out. + if (!TLI.hasStandaloneRem(VT)) { + return false; + } + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i8) LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8; @@ -1847,6 +1846,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, } // Fallthrough case CallingConv::C: + case CallingConv::CXX_FAST_TLS: // Use target triple & subtarget features to do actual dispatch. if (Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && @@ -1858,6 +1858,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); } case CallingConv::ARM_AAPCS_VFP: + case CallingConv::Swift: if (!isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); // Fall through to soft float variant, variadic functions don't @@ -2083,6 +2084,10 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) return false; @@ -2295,8 +2300,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // TODO: Avoid some calling conventions? - PointerType *PT = cast(CS.getCalledValue()->getType()); - FunctionType *FTy = cast(PT->getElementType()); + FunctionType *FTy = CS.getFunctionType(); bool isVarArg = FTy->isVarArg(); // Handle *simple* calls for now. @@ -2345,6 +2349,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, // FIXME: Only handle *easy* calls for now. if (CS.paramHasAttr(AttrInd, Attribute::InReg) || CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) || + CS.paramHasAttr(AttrInd, Attribute::SwiftError) || CS.paramHasAttr(AttrInd, Attribute::Nest) || CS.paramHasAttr(AttrInd, Attribute::ByVal)) return false; @@ -2394,22 +2400,15 @@ bool ARMFastISel::SelectCall(const Instruction *I, MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); - unsigned char OpFlags = 0; - - // Add MO_PLT for global address or external symbol in the PIC relocation - // model. - if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_) - OpFlags = ARMII::MO_PLT; - // ARM calls don't take a predicate, but tBL / tBLX do. 
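
The hasStandaloneRem() guard above exists because the AEABI runtime (RTABI 4.3.1) exposes only combined divide-and-remainder helpers such as __aeabi_idivmod, which return quotient and remainder together. A small C++ illustration of that pairing, with std::div standing in for the RTABI call:

#include <cstdlib>

// One call produces both results; asking for the remainder alone still
// pays for the pair.
int remainderViaDivRem(int A, int B) {
  std::div_t QR = std::div(A, B);
  return QR.rem;
}

Because the combined helper returns two values at once, and FastISel cannot lower that kind of multi-register return, SelectRem falls back to SelectionDAG on such targets.
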
if(isThumb2) AddDefaultPred(MIB); if (UseReg) MIB.addReg(CalleeReg); else if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, OpFlags); + MIB.addGlobalAddress(GV, 0, 0); else - MIB.addExternalSymbol(IntrMemName, OpFlags); + MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) @@ -2942,8 +2941,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT) { - bool UseGOT_PREL = - !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); LLVMContext *Context = &MF->getFunction()->getContext(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); @@ -3006,6 +3004,7 @@ bool ARMFastISel::fastLowerArguments() { case CallingConv::ARM_AAPCS_VFP: case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: + case CallingConv::Swift: break; } @@ -3019,6 +3018,8 @@ bool ARMFastISel::fastLowerArguments() { if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) || F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) return false; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index c5990bb7d1fb..e8c9f610ea64 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -98,35 +98,32 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); } -static bool isCSRestore(MachineInstr *MI, - const ARMBaseInstrInfo &TII, +static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII, const MCPhysReg *CSRegs) { // Integer spill area is handled with "pop". - if (isPopOpcode(MI->getOpcode())) { + if (isPopOpcode(MI.getOpcode())) { // The first two operands are predicates. The last two are // imp-def and imp-use of SP. Check everything in between. 
- for (int i = 5, e = MI->getNumOperands(); i != e; ++i) - if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + for (int i = 5, e = MI.getNumOperands(); i != e; ++i) + if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs)) return false; return true; } - if ((MI->getOpcode() == ARM::LDR_POST_IMM || - MI->getOpcode() == ARM::LDR_POST_REG || - MI->getOpcode() == ARM::t2LDR_POST) && - isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) && - MI->getOperand(1).getReg() == ARM::SP) + if ((MI.getOpcode() == ARM::LDR_POST_IMM || + MI.getOpcode() == ARM::LDR_POST_REG || + MI.getOpcode() == ARM::t2LDR_POST) && + isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) && + MI.getOperand(1).getReg() == ARM::SP) return true; return false; } -static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, - const ARMBaseInstrInfo &TII, unsigned DestReg, - unsigned SrcReg, int NumBytes, - unsigned MIFlags = MachineInstr::NoFlags, - ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) { +static void emitRegPlusImmediate( + bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg, + unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { if (isARM) emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, Pred, PredReg, TII, MIFlags); @@ -136,7 +133,7 @@ static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB, } static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, DebugLoc dl, + MachineBasicBlock::iterator &MBBI, const DebugLoc &dl, const ARMBaseInstrInfo &TII, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags, ARMCC::CondCodes Pred = ARMCC::AL, @@ -145,9 +142,9 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MIFlags, Pred, PredReg); } -static int sizeOfSPAdjustment(const MachineInstr *MI) { +static int sizeOfSPAdjustment(const MachineInstr &MI) { int RegSize; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case ARM::VSTMDDB_UPD: RegSize = 8; break; @@ -165,7 +162,7 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) { int count = 0; // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+ // pred) so the list starts at 4. 
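
A worked example for sizeOfSPAdjustment above, assuming its convention that the register list begins at operand index 4: "vpush {d8,d9,d10}" is a VSTMDDB_UPD with RegSize == 8 and three list operands, so it moves SP by 24 bytes.

// What the operand-counting loop computes, reduced to arithmetic.
constexpr int spAdjustBytes(int NumListRegs, int RegSize) {
  return NumListRegs * RegSize;
}
static_assert(spAdjustBytes(3, 8) == 24, "vpush {d8-d10} adjusts SP by 24");
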
- for (int i = MI->getNumOperands() - 1; i >= 4; --i) + for (int i = MI.getNumOperands() - 1; i >= 4; --i) count += RegSize; return count; } @@ -206,7 +203,8 @@ struct StackAdjustingInsts { } void emitDefCFAOffsets(MachineModuleInfo &MMI, MachineBasicBlock &MBB, - DebugLoc dl, const ARMBaseInstrInfo &TII, bool HasFP) { + const DebugLoc &dl, const ARMBaseInstrInfo &TII, + bool HasFP) { unsigned CFAOffset = 0; for (auto &Info : Insts) { if (HasFP && !Info.BeforeFPSet) @@ -235,7 +233,7 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, const TargetInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, const unsigned Reg, + const DebugLoc &DL, const unsigned Reg, const unsigned Alignment, const bool MustBeSingleInstruction) { const ARMSubtarget &AST = @@ -355,7 +353,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetDarwin()) { + if (STI.splitFramePushPop()) { GPRCS2Size += 4; break; } @@ -416,7 +414,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // .cfi_offset operations will reflect that. if (DPRGapSize) { assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs"); - if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize)) + if (tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize)) DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize); else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, @@ -430,7 +428,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) { - DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(MBBI)); + DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI)); LastPush = MBBI++; } } @@ -485,7 +483,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP) - .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP, RegState::Kill) .addReg(ARM::R4, RegState::Kill) .setMIFlags(MachineInstr::FrameSetup))); NumBytes = 0; @@ -494,7 +492,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Adjust SP after all the callee-save spills. if (AFI->getNumAlignedDPRCS2Regs() == 0 && - tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) + tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes)) DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes); else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, @@ -522,7 +520,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // that push. 
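
A sketch of the folding tryFoldSPUpdateIntoPushPop attempts in the hunks above: instead of a separate SP decrement next to a push, the adjustment is absorbed by widening the register list (register choice below is purely illustrative):

//   push {r4, lr} ; sub sp, sp, #8   ->   push {r2, r3, r4, lr}
// Each extra GPR in the list accounts for 4 bytes of adjustment.
constexpr unsigned extraRegsFor(unsigned Bytes) { return Bytes / 4; }
static_assert(extraRegsFor(8) == 2, "8 bytes folds as two scratch registers");
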
if (HasFP) {
 MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
- unsigned PushSize = sizeOfSPAdjustment(GPRCS1Push);
+ unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
 emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
 dl, TII, FramePtr, ARM::SP,
 PushSize + FramePtrOffsetInPush,
@@ -559,7 +557,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
 case ARM::R10:
 case ARM::R11:
 case ARM::R12:
- if (STI.isTargetDarwin())
+ if (STI.splitFramePushPop())
 break;
 // fallthrough
 case ARM::R0:
@@ -592,7 +590,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
 case ARM::R10:
 case ARM::R11:
 case ARM::R12:
- if (STI.isTargetDarwin()) {
+ if (STI.splitFramePushPop()) {
 unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
 unsigned Offset = MFI->getObjectOffset(FI);
 unsigned CFIIndex = MMI.addFrameInst(
@@ -727,8 +725,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
 if (MBBI != MBB.begin()) {
 do {
 --MBBI;
- } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
- if (!isCSRestore(MBBI, TII, CSRegs))
+ } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
+ if (!isCSRestore(*MBBI, TII, CSRegs))
 ++MBBI;
 }
@@ -774,8 +772,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
 .addReg(FramePtr));
 }
 } else if (NumBytes &&
- !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
 // Increment past our save areas.
 if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -904,33 +902,27 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
 unsigned LastReg = 0;
 for (; i != 0; --i) {
 unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
 // D-registers in the aligned area DPRCS2 are NOT spilled here.
 if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
 continue;
- // Add the callee-saved register as live-in unless it's LR and
- // @llvm.returnaddress is called. If LR is returned for
- // @llvm.returnaddress then it's already added to the function and
- // entry block live-in sets.
- bool isKill = true;
- if (Reg == ARM::LR) {
- if (MF.getFrameInfo()->isReturnAddressTaken() &&
- MF.getRegInfo().isLiveIn(Reg))
- isKill = false;
- }
-
- if (isKill)
+ bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ if (!isLiveIn)
 MBB.addLiveIn(Reg);
-
 // If NoGap is true, push consecutive registers and then leave the rest
 // for other instructions. e.g.
 // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
 if (NoGap && LastReg && LastReg != Reg-1)
 break;
 LastReg = Reg;
- Regs.push_back(std::make_pair(Reg, isKill));
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
 }
 if (Regs.empty())
@@ -991,7 +983,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
 bool DeleteRet = false;
 for (; i != 0; --i) {
 unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
 // The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) @@ -1027,7 +1019,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); if (DeleteRet && MI != MBB.end()) { - MIB.copyImplicitOps(&*MI); + MIB.copyImplicitOps(*MI); MI->eraseFromParent(); } MI = MIB; @@ -1367,7 +1359,7 @@ static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, unsigned FnSize = 0; for (auto &MBB : MF) { for (auto &MI : MBB) - FnSize += TII.GetInstSizeInBytes(&MI); + FnSize += TII.GetInstSizeInBytes(MI); } return FnSize; } @@ -1485,6 +1477,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, bool CS1Spilled = false; bool LRSpilled = false; unsigned NumGPRSpills = 0; + unsigned NumFPRSpills = 0; SmallVector UnspilledCS1GPRs; SmallVector UnspilledCS2GPRs; const ARMBaseRegisterInfo *RegInfo = static_cast( @@ -1539,13 +1532,22 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, CanEliminateFrame = false; } - if (!ARM::GPRRegClass.contains(Reg)) + if (!ARM::GPRRegClass.contains(Reg)) { + if (Spilled) { + if (ARM::SPRRegClass.contains(Reg)) + NumFPRSpills++; + else if (ARM::DPRRegClass.contains(Reg)) + NumFPRSpills += 2; + else if (ARM::QPRRegClass.contains(Reg)) + NumFPRSpills += 4; + } continue; + } if (Spilled) { NumGPRSpills++; - if (!STI.isTargetDarwin()) { + if (!STI.splitFramePushPop()) { if (Reg == ARM::LR) LRSpilled = true; CS1Spilled = true; @@ -1567,7 +1569,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, break; } } else { - if (!STI.isTargetDarwin()) { + if (!STI.splitFramePushPop()) { UnspilledCS1GPRs.push_back(Reg); continue; } @@ -1613,12 +1615,21 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // FIXME: We could add logic to be more precise about negative offsets // and which instructions will need a scratch register for them. Is it // worth the effort and added fragility? - bool BigStack = (RS && (MFI->estimateStackSize(MF) + - ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >= - estimateRSStackSizeLimit(MF, this))) || + unsigned EstimatedStackSize = + MFI->estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills); + if (hasFP(MF)) { + if (AFI->hasStackFrame()) + EstimatedStackSize += 4; + } else { + // If FP is not used, SP will be used to access arguments, so count the + // size of arguments into the estimation. + EstimatedStackSize += MF.getInfo()->getArgumentStackSize(); + } + EstimatedStackSize += 16; // For possible paddings. + + bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) || MFI->hasVarSizedObjects() || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); - bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); @@ -1712,6 +1723,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. 
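
A hedged restatement of the stack-size estimate introduced above (function and parameter names are illustrative, not from the patch): callee-save spills now feed the guess, with FPR spills counted in 4-byte words (SPR = 1, DPR = 2, QPR = 4).

unsigned estimatedStackSize(unsigned StackSize, unsigned NumGPRSpills,
                            unsigned NumFPRSpills, bool HasFP,
                            bool HasStackFrame, unsigned ArgStackSize) {
  unsigned E = StackSize + 4 * (NumGPRSpills + NumFPRSpills);
  if (HasFP)
    E += HasStackFrame ? 4 : 0;
  else
    E += ArgStackSize; // arguments are reached via SP when there is no FP
  return E + 16;       // slack for possible paddings
}
// e.g. estimatedStackSize(64, 3, 2, false, false, 8) == 64 + 20 + 8 + 16 == 108,
// which is then compared against estimateRSStackSizeLimit().
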
+ assert(RS && "Register scavenging not provided"); const TargetRegisterClass *RC = &ARM::GPRRegClass; RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), @@ -1726,19 +1738,18 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, } } - -void ARMFrameLowering:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { +MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { const ARMBaseInstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); if (!hasReservedCallFrame(MF)) { // If we have alloca, convert as follows: // ADJCALLSTACKDOWN -> sub, sp, sp, amount // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc dl = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); + MachineInstr &Old = *I; + DebugLoc dl = Old.getDebugLoc(); + unsigned Amount = Old.getOperand(0).getImm(); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1751,25 +1762,26 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool isARM = !AFI->isThumbFunction(); // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old->getOpcode(); - int PIdx = Old->findFirstPredOperandIdx(); - ARMCC::CondCodes Pred = (PIdx == -1) - ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); + unsigned Opc = Old.getOpcode(); + int PIdx = Old.findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = + (PIdx == -1) ? ARMCC::AL + : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm(); if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. - unsigned PredReg = Old->getOperand(2).getReg(); + unsigned PredReg = Old.getOperand(2).getReg(); emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags, Pred, PredReg); } else { // Note: PredReg is operand 3 for ADJCALLSTACKUP. - unsigned PredReg = Old->getOperand(3).getReg(); + unsigned PredReg = Old.getOperand(3).getReg(); assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags, Pred, PredReg); } } } - MBB.erase(I); + return MBB.erase(I); } /// Get the minimum constant for ARM that is greater than or equal to the @@ -2162,7 +2174,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( PrevStackMBB->addSuccessor(McrMBB); -#ifdef XDEBUG +#ifdef EXPENSIVE_CHECKS MF.verify(); #endif } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 66f4dfb6ef52..21cd78da395c 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -74,7 +74,7 @@ public: bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs) const; - void + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index 0157c0a35286..0d904ecb6296 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -50,8 +50,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { // Skip over one non-VFP / NEON instruction. if (!LastMI->isBarrier() && - // On A9, AGU and NEON/FPU are muxed. 
- !(TII.getSubtarget().isLikeA9() && LastMI->mayLoadOrStore()) && + !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) && (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 6e7edbf9fb15..20db3d39bcae 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -29,7 +29,6 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLowering.h" @@ -44,11 +43,6 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), cl::init(false)); -static cl::opt -CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, - cl::desc("Check fp vmla / vmls hazard at isel time"), - cl::init(true)); - //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -84,12 +78,11 @@ public: /// getI32Imm - Return a target constant of type i32 with the specified /// value. - inline SDValue getI32Imm(unsigned Imm, SDLoc dl) { + inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { return CurDAG->getTargetConstant(Imm, dl, MVT::i32); } - SDNode *Select(SDNode *N) override; - + void Select(SDNode *N) override; bool hasNoVMLxHazardUse(SDNode *N) const; bool isShifterOpProfitable(const SDValue &Shift, @@ -200,57 +193,61 @@ public: #include "ARMGenDAGISel.inc" private: - /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for - /// ARM. - SDNode *SelectARMIndexedLoad(SDNode *N); - SDNode *SelectT2IndexedLoad(SDNode *N); + /// Indexed (pre/post inc/dec) load matching code for ARM. + bool tryARMIndexedLoad(SDNode *N); + bool tryT1IndexedLoad(SDNode *N); + bool tryT2IndexedLoad(SDNode *N); /// SelectVLD - Select NEON load intrinsics. NumVecs should be /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, - const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); + void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, - const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); + void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for /// load/store of D registers and Q registers. 
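
A worked example for the new "Val <= 510" case in ConstantMaterializationCost above: any value in [256, 510] splits into an 8-bit MOV plus an 8-bit ADDS, e.g. Val = 300 becomes "movs r0, #255 ; adds r0, #45".

// Compile-time check of the window bound (illustrative only).
constexpr bool fitsMovPlusAdd8(unsigned Val) { return Val <= 255u + 255u; }
static_assert(fitsMovPlusAdd8(510) && !fitsMovPlusAdd8(511),
              "the two-instruction window ends exactly at 510");
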
- SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, - bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, const uint16_t *QOpcodes); + void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, + unsigned NumVecs, const uint16_t *DOpcodes, + const uint16_t *QOpcodes); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. (Q registers are not supported.) - SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes); + void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be /// generated to force the table registers to be consecutive. - SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); - /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. - SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); + /// Try to select SBFX/UBFX instructions for ARM. + bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned); // Select special operations if node forms integer ABS pattern - SDNode *SelectABSOp(SDNode *N); + bool tryABSOp(SDNode *N); + + bool tryReadRegister(SDNode *N); + bool tryWriteRegister(SDNode *N); - SDNode *SelectReadRegister(SDNode *N); - SDNode *SelectWriteRegister(SDNode *N); + bool tryInlineAsm(SDNode *N); - SDNode *SelectInlineAsm(SDNode *N); + void SelectConcatVector(SDNode *N); - SDNode *SelectConcatVector(SDNode *N); + bool trySMLAWSMULW(SDNode *N); + + void SelectCMP_SWAP(SDNode *N); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. @@ -269,7 +266,7 @@ private: SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); // Get the alignment operand for a NEON VLD or VST instruction. 
- SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, + SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector); /// Returns the number of instructions required to materialize the given @@ -426,11 +423,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (OptLevel == CodeGenOpt::None) return true; - if (!CheckVMLxHazard) - return true; - - if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() && - !Subtarget->isCortexA9() && !Subtarget->isSwift()) + if (!Subtarget->hasVMLxHazards()) return true; if (!N->hasOneUse()) @@ -484,6 +477,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { if (Subtarget->isThumb()) { if (Val <= 255) return 1; // MOV if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (Val <= 510) return 2; // MOV + ADDi8 if (~Val <= 255) return 2; // MOV + MVN if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL } else { @@ -548,11 +542,9 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { - BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, - N.getOperand(0), NewMulConst) - .getNode()), - 0); + HandleSDNode Handle(N); replaceDAGValue(N.getOperand(1), NewMulConst); + BaseReg = Handle.getValue(); Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, PowerOfTwo), SDLoc(N), MVT::i32); @@ -623,6 +615,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else @@ -803,6 +796,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } @@ -1070,6 +1064,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } @@ -1190,6 +1185,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return false; // We want to select register offset instead } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { @@ -1297,6 +1293,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol && N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) @@ -1468,15 +1465,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, //===--------------------------------------------------------------------===// /// getAL - Returns 
an ARMCC::AL immediate node.
-static inline SDValue getAL(SelectionDAG *CurDAG, SDLoc dl) {
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
 return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
}
-SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
 LoadSDNode *LD = cast<LoadSDNode>(N);
 ISD::MemIndexedMode AM = LD->getAddressingMode();
 if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
 EVT LoadedVT = LD->getMemoryVT();
 SDValue Offset, AMOpc;
@@ -1530,26 +1527,53 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
 SDValue Base = LD->getBasePtr();
 SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
 CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
 } else {
 SDValue Chain = LD->getChain();
 SDValue Base = LD->getBasePtr();
 SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
 CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
 }
 }
- return nullptr;
+ return false;
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ AM != ISD::POST_INC || LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+ return false;
+
+ auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+ if (!COffs || COffs->getZExtValue() != 4)
+ return false;
+
+ // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+ // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+ // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+ // ISel.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops));
+ return true;
}
-SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
 LoadSDNode *LD = cast<LoadSDNode>(N);
 ISD::MemIndexedMode AM = LD->getAddressingMode();
 if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
 EVT LoadedVT = LD->getMemoryVT();
 bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1576,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
 Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
 break;
 default:
- return nullptr;
+ return false;
 }
 Match = true;
 }
@@ -1586,11 +1610,12 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
 SDValue Base = LD->getBasePtr();
 SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
 CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
- MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops));
+ return true;
 }
- return nullptr;
+ return false;
}

/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
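
The tLDR_postidx pseudo above models a one-register LDM. In C terms the matched pattern is exactly a 32-bit load whose base advances by 4 afterwards:

// ldm r0!, {r1} : load from [r0], then r0 += 4.
int loadPostInc(const int *&P) { return *P++; }

Hence the guards: a non-extending i32 load, POST_INC addressing, and a constant offset of exactly 4, since LDM can only step the base by the access size.
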
@@ -1685,7 +1710,7 @@ SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand /// of a NEON VLD or VST instruction. The supported values depend on the /// number of registers being loaded. -SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, SDLoc dl, +SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector) { unsigned NumRegs = NumVecs; if (!is64BitVector && NumVecs < 3) @@ -1806,17 +1831,17 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { return Opc; // If not one we handle, return it unchanged. } -SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, - const uint16_t *QOpcodes0, - const uint16_t *QOpcodes1) { +void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; unsigned AddrOpIdx = isUpdating ? 1 : 2; if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return nullptr; + return; SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); @@ -1922,13 +1947,16 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, MemOp[0] = cast(N)->getMemOperand(); cast(VLd)->setMemRefs(MemOp, MemOp + 1); - if (NumVecs == 1) - return VLd; + if (NumVecs == 1) { + ReplaceNode(N, VLd); + return; + } // Extract out the subregisters. SDValue SuperReg = SDValue(VLd, 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && - ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 && + ARM::qsub_3 == ARM::qsub_0 + 3, + "Unexpected subreg numbering"); unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), @@ -1936,13 +1964,13 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, - const uint16_t *QOpcodes0, - const uint16_t *QOpcodes1) { +void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); SDLoc dl(N); @@ -1950,7 +1978,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned AddrOpIdx = isUpdating ? 1 : 2; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return nullptr; + return; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2042,7 +2070,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // Transfer memoperands. 
cast(VSt)->setMemRefs(MemOp, MemOp + 1); - return VSt; + ReplaceNode(N, VSt); + return; } // Otherwise, quad registers are stored with two separate instructions, @@ -2083,13 +2112,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); cast(VStB)->setMemRefs(MemOp, MemOp + 1); - return VStB; + ReplaceNode(N, VStB); } -SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, - bool isUpdating, unsigned NumVecs, - const uint16_t *DOpcodes, - const uint16_t *QOpcodes) { +void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, + unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); SDLoc dl(N); @@ -2097,7 +2126,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned AddrOpIdx = isUpdating ? 1 : 2; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return nullptr; + return; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2188,13 +2217,16 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); cast(VLdLn)->setMemRefs(MemOp, MemOp + 1); - if (!IsLoad) - return VLdLn; + if (!IsLoad) { + ReplaceNode(N, VLdLn); + return; + } // Extract the subregisters. SuperReg = SDValue(VLdLn, 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && - ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 && + ARM::qsub_3 == ARM::qsub_0 + 3, + "Unexpected subreg numbering"); unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), @@ -2202,18 +2234,17 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { +void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) - return nullptr; + return; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2277,7 +2308,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, SuperReg = SDValue(VLdDup, 0); // Extract the subregisters. 
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering"); unsigned SubIdx = ARM::dsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), @@ -2285,11 +2316,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, - unsigned Opc) { +void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); SDLoc dl(N); EVT VT = N->getValueType(0); @@ -2318,13 +2349,12 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); Ops.push_back(getAL(CurDAG, dl)); // predicate Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); } -SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, - bool isSigned) { +bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) - return nullptr; + return false; unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) @@ -2338,7 +2368,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, // The immediate is a mask of the low bits iff imm & (imm+1) == 0 if (And_imm & (And_imm + 1)) - return nullptr; + return false; unsigned Srl_imm = 0; if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, @@ -2358,7 +2388,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; } // ARM models shift instructions as MOVsi with shifter operand. 
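
A worked example for the UBFX path above: for (x >> 7) & 0x1f, And_imm == 0x1f and Srl_imm == 7; since 0x1f & (0x1f + 1) == 0 the mask is a contiguous run of low bits, and the pair folds to "ubfx Rd, Rn, #7, #5".

constexpr bool isLowBitMask(unsigned M) { return (M & (M + 1)) == 0; }
static_assert(isLowBitMask(0x1f) && !isLowBitMask(0x2f),
              "imm & (imm + 1) == 0 detects contiguous low-bit masks");
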
@@ -2368,17 +2399,19 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc, getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops); + return true; } SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; } } - return nullptr; + return false; } // Otherwise, we're looking for a shift of a shift @@ -2392,13 +2425,35 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, unsigned Width = 32 - Srl_imm - 1; int LSB = Srl_imm - Shl_imm; if (LSB < 0) - return nullptr; + return false; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; + } + } + + // Or we are looking for a shift of an and, with a mask operand + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) && + isShiftedMask_32(And_imm)) { + unsigned Srl_imm = 0; + unsigned LSB = countTrailingZeros(And_imm); + // Shift must be the same as the ands lsb + if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + unsigned MSB = 31 - countLeadingZeros(And_imm); + // Note: The width operand is encoded as width-1. + unsigned Width = MSB - LSB; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32), + CurDAG->getTargetConstant(Width, dl, MVT::i32), + getAL(CurDAG, dl), Reg0 }; + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; } } @@ -2407,20 +2462,21 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, unsigned LSB = 0; if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) && !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB)) - return nullptr; + return false; if (LSB + Width > 32) - return nullptr; + return false; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, dl, MVT::i32), CurDAG->getTargetConstant(Width - 1, dl, MVT::i32), getAL(CurDAG, dl), Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; } - return nullptr; + return false; } /// Target-specific DAG combining for ISD::XOR. @@ -2433,16 +2489,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, /// Y = sra (X, size(X)-1); xor (add (X, Y), Y) /// ARM instruction selection detects the latter and matches it to /// ARM::ABS or ARM::t2ABS machine node. 
-SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
 SDValue XORSrc0 = N->getOperand(0);
 SDValue XORSrc1 = N->getOperand(1);
 EVT VT = N->getValueType(0);
 if (Subtarget->isThumb1Only())
- return nullptr;
+ return false;
 if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
- return nullptr;
+ return false;
 SDValue ADDSrc0 = XORSrc0.getOperand(0);
 SDValue ADDSrc1 = XORSrc0.getOperand(1);
@@ -2456,57 +2512,214 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
 XType.isInteger() && SRAConstant != nullptr &&
 Size == SRAConstant->getZExtValue()) {
 unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
- return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ return true;
+ }
+
+ return false;
+}
+
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+ bool Accumulate) {
+ // For SM*WB, we need some form of sext.
+ // For SM*WT, we need to search for (sra X, 16).
+ // Src1 then gets set to X.
+ if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+ SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ SignExt.getOpcode() == ISD::AssertSext) &&
+ SignExt.getValueType() == MVT::i32) {
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = SignExt.getOperand(0);
+ return true;
 }
- return nullptr;
+ if (SignExt.getOpcode() != ISD::SRA)
+ return false;
+
+ ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+ if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+ return false;
+
+ SDValue Op0 = SignExt.getOperand(0);
+
+ // The sign extend operand for SM*WB could be generated by a shl and ashr.
+ if (Op0.getOpcode() == ISD::SHL) {
+ SDValue SHL = Op0;
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = Op0.getOperand(0);
+ return true;
+ }
+ *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+ Src1 = SignExt.getOperand(0);
+ return true;
}

-SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+ SDValue &Src1, bool Accumulate) {
+ // First we look for:
+ // (add (or (srl ?, 16), (shl ?, 16)))
+ if (OR.getOpcode() != ISD::OR)
+ return false;
+
+ SDValue SRL = OR.getOperand(0);
+ SDValue SHL = OR.getOperand(1);
+
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR.getOperand(1);
+ SHL = OR.getOperand(0);
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+ return false;
+ }
+
+ ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+ SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return false;
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return false;
+
+ // Now we have:
+ // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
+ // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
+ // For SMLAWB the 16-bit value will be sign extended somehow.
+ // For SMLAWT only the SRA is required.
+ + // Check both sides of SMUL_LOHI + if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) { + Src0 = SMULLOHI->getOperand(1); + } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1, + Accumulate)) { + Src0 = SMULLOHI->getOperand(0); + } else { + return false; + } + return true; +} + +bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) { + SDLoc dl(N); + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue A, B; + unsigned Opc = 0; + + if (N->getOpcode() == ISD::ADD) { + if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR) + return false; + + SDValue Acc; + if (SearchSignedMulLong(Src0, &Opc, A, B, true)) { + Acc = Src1; + } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) { + Acc = Src0; + } else { + return false; + } + if (Opc == 0) + return false; + + SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops); + return true; + } else if (N->getOpcode() == ISD::OR && + SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) { + if (Opc == 0) + return false; + + SDValue Ops[] = { A, B, getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32)}; + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return true; + } + return false; +} + +/// We've got special pseudo-instructions for these +void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) { + unsigned Opcode; + EVT MemTy = cast(N)->getMemoryVT(); + if (MemTy == MVT::i8) + Opcode = ARM::CMP_SWAP_8; + else if (MemTy == MVT::i16) + Opcode = ARM::CMP_SWAP_16; + else if (MemTy == MVT::i32) + Opcode = ARM::CMP_SWAP_32; + else + llvm_unreachable("Unknown AtomicCmpSwap type"); + + SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(0)}; + SDNode *CmpSwap = CurDAG->getMachineNode( + Opcode, SDLoc(N), + CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); + CurDAG->RemoveDeadNode(N); +} + +void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. EVT VT = N->getValueType(0); if (!VT.is128BitVector() || N->getNumOperands() != 2) llvm_unreachable("unexpected CONCAT_VECTORS"); - return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)); + ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1))); } -SDNode *ARMDAGToDAGISel::Select(SDNode *N) { +void ARMDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { N->setNodeId(-1); - return nullptr; // Already selected. + return; // Already selected. 
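
Reference semantics for the two custom selections above, as a sketch (function names are illustrative). SMULWB multiplies a 32-bit value by a sign-extended 16-bit value and keeps the top 32 bits of the 48-bit product; SMLAWB adds an accumulator:

int smulwbRef(int A, short B) {
  return (int)(((long long)A * B) >> 16); // top 32 bits of the 48-bit product
}
int smlawbRef(int A, short B, int Acc) { return smulwbRef(A, B) + Acc; }

The CMP_SWAP_* pseudos, in turn, survive until after register allocation (see ExpandCMP_SWAP earlier in this patch), where they become the usual load-exclusive/store-exclusive retry loop; in effect:

#include <atomic>
bool cmpSwapRef(std::atomic<int> &Mem, int &Expected, int Desired) {
  return Mem.compare_exchange_strong(Expected, Desired); // ldrex/cmp/strex loop
}
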
} switch (N->getOpcode()) { default: break; - case ISD::WRITE_REGISTER: { - SDNode *ResNode = SelectWriteRegister(N); - if (ResNode) - return ResNode; + case ISD::ADD: + case ISD::OR: + if (trySMLAWSMULW(N)) + return; break; - } - case ISD::READ_REGISTER: { - SDNode *ResNode = SelectReadRegister(N); - if (ResNode) - return ResNode; + case ISD::WRITE_REGISTER: + if (tryWriteRegister(N)) + return; break; - } - case ISD::INLINEASM: { - SDNode *ResNode = SelectInlineAsm(N); - if (ResNode) - return ResNode; + case ISD::READ_REGISTER: + if (tryReadRegister(N)) + return; break; - } - case ISD::XOR: { + case ISD::INLINEASM: + if (tryInlineAsm(N)) + return; + break; + case ISD::XOR: // Select special operations if XOR node forms integer ABS pattern - SDNode *ResNode = SelectABSOp(N); - if (ResNode) - return ResNode; + if (tryABSOp(N)) + return; // Other cases are autogenerated. break; - } case ISD::Constant: { unsigned Val = cast(N)->getZExtValue(); // If we can't materialize the constant we need to use a literal pool @@ -2530,11 +2743,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->getRegister(0, MVT::i32), CurDAG->getEntryNode() }; - ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, - Ops); + ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, + Ops); } - ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); - return nullptr; + ReplaceNode(N, ResNode); + return; } // Other cases are autogenerated. @@ -2551,25 +2764,27 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { MachineFrameInfo *MFI = MF->getFrameInfo(); if (MFI->getObjectAlignment(FI) < 4) MFI->setObjectAlignment(FI, 4); - return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, - CurDAG->getTargetConstant(0, dl, MVT::i32)); + CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, + CurDAG->getTargetConstant(0, dl, MVT::i32)); + return; } else { unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? ARM::t2ADDri : ARM::ADDri); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + return; } } case ISD::SRL: - if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false)) - return I; + if (tryV6T2BitfieldExtractOp(N, false)) + return; break; case ISD::SIGN_EXTEND_INREG: case ISD::SRA: - if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true)) - return I; + if (tryV6T2BitfieldExtractOp(N, true)) + return; break; case ISD::MUL: if (Subtarget->isThumb1Only()) @@ -2587,11 +2802,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops); + return; } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops); + return; } } if (isPowerOf2_32(RHSV+1)) { // 2^n-1? 
@@ -2604,19 +2821,63 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops); + return; } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops); + CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops); + return; } } } break; case ISD::AND: { // Check for unsigned bitfield extract - if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false)) - return I; + if (tryV6T2BitfieldExtractOp(N, false)) + return; + + // If an immediate is used in an AND node, it is possible that the immediate + // can be more optimally materialized when negated. If this is the case we + // can negate the immediate and use a BIC instead. + auto *N1C = dyn_cast(N->getOperand(1)); + if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) { + uint32_t Imm = (uint32_t) N1C->getZExtValue(); + + // In Thumb2 mode, an AND can take a 12-bit immediate. If this + // immediate can be negated and fit in the immediate operand of + // a t2BIC, don't do any manual transform here as this can be + // handled by the generic ISel machinery. + bool PreferImmediateEncoding = + Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm)); + if (!PreferImmediateEncoding && + ConstantMaterializationCost(Imm) > + ConstantMaterializationCost(~Imm)) { + // The current immediate costs more to materialize than a negated + // immediate, so negate the immediate and use a BIC. + SDValue NewImm = + CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32); + // If the new constant didn't exist before, reposition it in the topological + // ordering so it is just before N. Otherwise, don't touch its location. + if (NewImm->getNodeId() == -1) + CurDAG->RepositionNode(N->getIterator(), NewImm.getNode()); + + if (!Subtarget->hasThumb2()) { + SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), + N->getOperand(0), NewImm, getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32)}; + ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops)); + return; + } else { + SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32)}; + ReplaceNode(N, + CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops)); + return; + } + } + } // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits // of c1 are 0xffff, and lower 16-bit of c2 are 0. 
That is, the top 16-bits @@ -2632,7 +2893,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (!Opc) break; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); - ConstantSDNode *N1C = dyn_cast(N1); + N1C = dyn_cast(N1); if (!N1C) break; if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) { @@ -2649,29 +2910,34 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { dl, MVT::i32); SDValue Ops[] = { N0.getOperand(0), Imm16, getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); + return; } } break; } case ARMISD::VMOVRRD: - return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32, - N->getOperand(0), getAL(CurDAG, dl), - CurDAG->getRegister(0, MVT::i32)); + ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32, + N->getOperand(0), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32))); + return; case ISD::UMUL_LOHI: { if (Subtarget->isThumb1Only()) break; if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops)); + return; } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? - ARM::UMULL : ARM::UMULLv5, - dl, MVT::i32, MVT::i32, Ops); + ReplaceNode(N, CurDAG->getMachineNode( + Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl, + MVT::i32, MVT::i32, Ops)); + return; } } case ISD::SMUL_LOHI: { @@ -2680,30 +2946,76 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops)); + return; } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? - ARM::SMULL : ARM::SMULLv5, - dl, MVT::i32, MVT::i32, Ops); + ReplaceNode(N, CurDAG->getMachineNode( + Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl, + MVT::i32, MVT::i32, Ops)); + return; } } + case ARMISD::UMAAL: { + unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL; + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3), + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops)); + return; + } case ARMISD::UMLAL:{ + // UMAAL is similar to UMLAL but it adds two 32-bit values to the + // 64-bit multiplication result. 
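
As a reference for the pattern being matched below, this is UMAAL's semantics (an illustrative sketch; the helper name is invented, not from the patch). The ADDC/ADDE pair checked for computes the 64-bit sum of two 32-bit operands, which is exactly the RdHi:RdLo accumulator UMAAL expects, so the UMLAL node can be rewritten as UMAAL.

    #include <cstdint>
    // UMAAL RdLo, RdHi, Rn, Rm: unsigned 32x32->64 multiply, then add both
    // 32-bit halves of the destination pair. The sum cannot overflow:
    // (2^32-1)^2 + 2*(2^32-1) == 2^64-1 fits exactly in 64 bits.
    inline void umaal_ref(uint32_t &RdLo, uint32_t &RdHi,
                          uint32_t Rn, uint32_t Rm) {
      uint64_t Res = (uint64_t)Rn * Rm + RdLo + RdHi;
      RdLo = (uint32_t)Res;
      RdHi = (uint32_t)(Res >> 32);
    }
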
+ if (Subtarget->hasV6Ops() && N->getOperand(2).getOpcode() == ARMISD::ADDC && + N->getOperand(3).getOpcode() == ARMISD::ADDE) { + + SDValue Addc = N->getOperand(2); + SDValue Adde = N->getOperand(3); + + if (Adde.getOperand(2).getNode() == Addc.getNode()) { + + ConstantSDNode *Op0 = dyn_cast(Adde.getOperand(0)); + ConstantSDNode *Op1 = dyn_cast(Adde.getOperand(1)); + + if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0) + { + // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm + // RdLo = one operand to be added, lower 32-bits of res + // RdHi = other operand to be added, upper 32-bits of res + // Rn = first multiply operand + // Rm = second multiply operand + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + Addc.getOperand(0), Addc.getOperand(1), + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL; + CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops); + return; + } + } + } + if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; - return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops)); + return; }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? - ARM::UMLAL : ARM::UMLALv5, - dl, MVT::i32, MVT::i32, Ops); + ReplaceNode(N, CurDAG->getMachineNode( + Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl, + MVT::i32, MVT::i32, Ops)); + return; } } case ARMISD::SMLAL:{ @@ -2711,25 +3023,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)}; - return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops)); + return; }else{ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? - ARM::SMLAL : ARM::SMLALv5, - dl, MVT::i32, MVT::i32, Ops); + ReplaceNode(N, CurDAG->getMachineNode( + Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl, + MVT::i32, MVT::i32, Ops)); + return; } } case ISD::LOAD: { - SDNode *ResNode = nullptr; - if (Subtarget->isThumb() && Subtarget->hasThumb2()) - ResNode = SelectT2IndexedLoad(N); - else - ResNode = SelectARMIndexedLoad(N); - if (ResNode) - return ResNode; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) { + if (tryT2IndexedLoad(N)) + return; + } else if (Subtarget->isThumb()) { + if (tryT1IndexedLoad(N)) + return; + } else if (tryARMIndexedLoad(N)) + return; // Other cases are autogenerated. 
break; } @@ -2770,13 +3086,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } ReplaceUses(SDValue(N, 0), SDValue(Chain.getNode(), Chain.getResNo())); - return nullptr; + CurDAG->RemoveDeadNode(N); + return; } case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return nullptr; + default: return; case MVT::v8i8: Opc = ARM::VZIPd8; break; case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: @@ -2790,13 +3107,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); + return; } case ARMISD::VUZP: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return nullptr; + default: return; case MVT::v8i8: Opc = ARM::VUZPd8; break; case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: @@ -2810,13 +3128,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); + return; } case ARMISD::VTRN: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return nullptr; + default: return; case MVT::v8i8: Opc = ARM::VTRNd8; break; case MVT::v4i16: Opc = ARM::VTRNd16; break; case MVT::v2f32: @@ -2829,7 +3148,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Pred = getAL(CurDAG, dl); SDValue PredReg = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; - return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops)); + return; } case ARMISD::BUILD_VECTOR: { EVT VecVT = N->getValueType(0); @@ -2837,55 +3157,68 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned NumElts = VecVT.getVectorNumElements(); if (EltVT == MVT::f64) { assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); - return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)); + ReplaceNode( + N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1))); + return; } assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR"); - if (NumElts == 2) - return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)); + if (NumElts == 2) { + ReplaceNode( + N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1))); + return; + } assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); - return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3)); + ReplaceNode(N, + createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3))); + return; } case ARMISD::VLD2DUP: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, ARM::VLD2DUPd32 }; - return SelectVLDDup(N, false, 2, Opcodes); + SelectVLDDup(N, false, 2, Opcodes); + return; } case ARMISD::VLD3DUP: { static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd32Pseudo }; - return SelectVLDDup(N, false, 3, Opcodes); + SelectVLDDup(N, false, 3, Opcodes); + return; } case ARMISD::VLD4DUP: { static const uint16_t Opcodes[] = { 
ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd32Pseudo }; - return SelectVLDDup(N, false, 4, Opcodes); + SelectVLDDup(N, false, 4, Opcodes); + return; } case ARMISD::VLD2DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, ARM::VLD2DUPd32wb_fixed }; - return SelectVLDDup(N, true, 2, Opcodes); + SelectVLDDup(N, true, 2, Opcodes); + return; } case ARMISD::VLD3DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd32Pseudo_UPD }; - return SelectVLDDup(N, true, 3, Opcodes); + SelectVLDDup(N, true, 3, Opcodes); + return; } case ARMISD::VLD4DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd32Pseudo_UPD }; - return SelectVLDDup(N, true, 4, Opcodes); + SelectVLDDup(N, true, 4, Opcodes); + return; } case ARMISD::VLD1_UPD: { @@ -2897,7 +3230,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1q16wb_fixed, ARM::VLD1q32wb_fixed, ARM::VLD1q64wb_fixed }; - return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr); + SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr); + return; } case ARMISD::VLD2_UPD: { @@ -2908,7 +3242,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q32PseudoWB_fixed }; - return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + return; } case ARMISD::VLD3_UPD: { @@ -2922,7 +3257,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q32oddPseudo_UPD }; - return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; } case ARMISD::VLD4_UPD: { @@ -2936,7 +3272,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q32oddPseudo_UPD }; - return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; } case ARMISD::VLD2LN_UPD: { @@ -2945,7 +3282,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + return; } case ARMISD::VLD3LN_UPD: { @@ -2954,7 +3292,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD3LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + return; } case ARMISD::VLD4LN_UPD: { @@ -2963,7 +3302,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD4LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + return; } case ARMISD::VST1_UPD: { @@ -2975,7 +3315,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1q16wb_fixed, ARM::VST1q32wb_fixed, ARM::VST1q64wb_fixed }; - return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr); + SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr); + return; } case ARMISD::VST2_UPD: { @@ -2986,7 
+3327,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, ARM::VST2q16PseudoWB_fixed, ARM::VST2q32PseudoWB_fixed }; - return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); + SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); + return; } case ARMISD::VST3_UPD: { @@ -3000,7 +3342,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, ARM::VST3q16oddPseudo_UPD, ARM::VST3q32oddPseudo_UPD }; - return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; } case ARMISD::VST4_UPD: { @@ -3014,7 +3357,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, ARM::VST4q16oddPseudo_UPD, ARM::VST4q32oddPseudo_UPD }; - return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; } case ARMISD::VST2LN_UPD: { @@ -3023,7 +3367,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST2LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); + return; } case ARMISD::VST3LN_UPD: { @@ -3032,7 +3377,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST3LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); + return; } case ARMISD::VST4LN_UPD: { @@ -3041,7 +3387,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST4LNd32Pseudo_UPD }; static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq32Pseudo_UPD }; - return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); + return; } case ISD::INTRINSIC_VOID: @@ -3051,12 +3398,44 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_mrrc: + case Intrinsic::arm_mrrc2: { + SDLoc dl(N); + SDValue Chain = N->getOperand(0); + unsigned Opc; + + if (Subtarget->isThumb()) + Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2); + else + Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2); + + SmallVector<SDValue, 5> Ops; + Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */ + Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */ + Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */ + + // The mrrc2 instruction in ARM doesn't allow predicates; the top 4 bits of + // the encoded instruction will always be '1111'. It is possible in assembly + // language to specify AL as a predicate to mrrc2, but it makes no difference + // to the encoded instruction. + if (Opc != ARM::MRRC2) { + Ops.push_back(getAL(CurDAG, dl)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + } + + Ops.push_back(Chain); + + // Writes to two registers.
+ const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other}; + + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops)); + return; + } case Intrinsic::arm_ldaexd: case Intrinsic::arm_ldrexd: { SDLoc dl(N); SDValue Chain = N->getOperand(0); SDValue MemAddr = N->getOperand(2); - bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2(); + bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps(); bool IsAcquire = IntNo == Intrinsic::arm_ldaexd; unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD) @@ -3072,11 +3451,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResTys.push_back(MVT::Other); // Place arguments in the right order. - SmallVector Ops; - Ops.push_back(MemAddr); - Ops.push_back(getAL(CurDAG, dl)); - Ops.push_back(CurDAG->getRegister(0, MVT::i32)); - Ops.push_back(Chain); + SDValue Ops[] = {MemAddr, getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), Chain}; SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); @@ -3112,7 +3488,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ReplaceUses(SDValue(N, 1), Result); } ReplaceUses(SDValue(N, 2), OutChain); - return nullptr; + CurDAG->RemoveDeadNode(N); + return; } case Intrinsic::arm_stlexd: case Intrinsic::arm_strexd: { @@ -3150,7 +3527,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { MemOp[0] = cast(N)->getMemOperand(); cast(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); + return; } case Intrinsic::arm_neon_vld1: { @@ -3158,7 +3536,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1d32, ARM::VLD1d64 }; static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, ARM::VLD1q32, ARM::VLD1q64}; - return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr); + SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr); + return; } case Intrinsic::arm_neon_vld2: { @@ -3166,7 +3545,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2d32, ARM::VLD1q64 }; static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, ARM::VLD2q32Pseudo }; - return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); + SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); + return; } case Intrinsic::arm_neon_vld3: { @@ -3180,7 +3560,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo, ARM::VLD3q16oddPseudo, ARM::VLD3q32oddPseudo }; - return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; } case Intrinsic::arm_neon_vld4: { @@ -3194,7 +3575,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo, ARM::VLD4q16oddPseudo, ARM::VLD4q32oddPseudo }; - return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; } case Intrinsic::arm_neon_vld2lane: { @@ -3203,7 +3585,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; - return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); + SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); + return; } case Intrinsic::arm_neon_vld3lane: { @@ -3212,7 +3595,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD3LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; - return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); + 
SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); + return; } case Intrinsic::arm_neon_vld4lane: { @@ -3221,7 +3605,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD4LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; - return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); + SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); + return; } case Intrinsic::arm_neon_vst1: { @@ -3229,15 +3614,17 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1d32, ARM::VST1d64 }; static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, ARM::VST1q32, ARM::VST1q64 }; - return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr); + SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr); + return; } case Intrinsic::arm_neon_vst2: { static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, ARM::VST2d32, ARM::VST1q64 }; - static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, - ARM::VST2q32Pseudo }; - return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); + static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, + ARM::VST2q32Pseudo }; + SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); + return; } case Intrinsic::arm_neon_vst3: { @@ -3251,7 +3638,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo, ARM::VST3q16oddPseudo, ARM::VST3q32oddPseudo }; - return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + return; } case Intrinsic::arm_neon_vst4: { @@ -3265,7 +3653,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo, ARM::VST4q16oddPseudo, ARM::VST4q32oddPseudo }; - return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; } case Intrinsic::arm_neon_vst2lane: { @@ -3274,7 +3663,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST2LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; - return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); + return; } case Intrinsic::arm_neon_vst3lane: { @@ -3283,7 +3673,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST3LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; - return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); + return; } case Intrinsic::arm_neon_vst4lane: { @@ -3292,7 +3683,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST4LNd32Pseudo }; static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; - return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); + SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); + return; } } break; @@ -3305,18 +3697,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; case Intrinsic::arm_neon_vtbl2: - return SelectVTBL(N, false, 2, ARM::VTBL2); + SelectVTBL(N, false, 2, ARM::VTBL2); + return; case Intrinsic::arm_neon_vtbl3: - return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); + SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); + return; case Intrinsic::arm_neon_vtbl4: - return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); + SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); + return; case Intrinsic::arm_neon_vtbx2: - return SelectVTBL(N, true, 2, ARM::VTBX2); + SelectVTBL(N, true, 2, 
ARM::VTBX2); + return; case Intrinsic::arm_neon_vtbx3: - return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); + SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); + return; case Intrinsic::arm_neon_vtbx4: - return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); + SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); + return; } break; } @@ -3324,13 +3722,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ARMISD::VTBL1: { SDLoc dl(N); EVT VT = N->getValueType(0); - SmallVector Ops; - - Ops.push_back(N->getOperand(0)); - Ops.push_back(N->getOperand(1)); - Ops.push_back(getAL(CurDAG, dl)); // Predicate - Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register - return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops); + SDValue Ops[] = {N->getOperand(0), N->getOperand(1), + getAL(CurDAG, dl), // Predicate + CurDAG->getRegister(0, MVT::i32)}; // Predicate Register + ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops)); + return; } case ARMISD::VTBL2: { SDLoc dl(N); @@ -3341,19 +3737,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue V1 = N->getOperand(1); SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0); - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(2)); - Ops.push_back(getAL(CurDAG, dl)); // Predicate - Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register - return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops); + SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate + CurDAG->getRegister(0, MVT::i32)}; // Predicate Register + ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops)); + return; } case ISD::CONCAT_VECTORS: - return SelectConcatVector(N); + SelectConcatVector(N); + return; + + case ISD::ATOMIC_CMP_SWAP: + SelectCMP_SWAP(N); + return; } - return SelectCode(N); + SelectCode(N); } // Inspect a register string of the form @@ -3362,8 +3761,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { // and obtain the integer operands from them, adding these operands to the // provided vector. static void getIntOperandsFromRegisterString(StringRef RegString, - SelectionDAG *CurDAG, SDLoc DL, - std::vector& Ops) { + SelectionDAG *CurDAG, + const SDLoc &DL, + std::vector &Ops) { SmallVector Fields; RegString.split(Fields, ':'); @@ -3444,6 +3844,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) { .Case("basepri_max", 0x12) .Case("faultmask", 0x13) .Case("control", 0x14) + .Case("msplim", 0x0a) + .Case("psplim", 0x0b) + .Case("sp", 0x18) .Default(-1); } @@ -3473,11 +3876,27 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13) return -1; + if (Subtarget->has8MSecExt() && Flags.lower() == "ns") { + Flags = ""; + SYSmvalue |= 0x80; + } + + if (!Subtarget->has8MSecExt() && + (SYSmvalue == 0xa || SYSmvalue == 0xb || SYSmvalue > 0x14)) + return -1; + + if (!Subtarget->hasV8MMainlineOps() && + (SYSmvalue == 0x8a || SYSmvalue == 0x8b || SYSmvalue == 0x91 || + SYSmvalue == 0x93)) + return -1; + // If it was a read then we won't be expecting flags and so at this point // we can return the mask. if (IsRead) { - assert (Flags.empty() && "Unexpected flags for reading M class register."); - return SYSmvalue; + if (Flags.empty()) + return SYSmvalue; + else + return -1; } // We know we are now handling a write so need to get the mask for the flags. 
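
To summarise the SYSm encoding rules above in one place, a simplified editorial sketch (the helper is hypothetical, and it omits the separate v8-M Mainline gate on the banked "_ns" forms): the register name selects SYSm[4:0], and with the ARMv8-M Security Extension an "_ns" suffix sets bit 7 to address the Non-secure copy of the register.

    // Returns the SYSm encoding, or -1 if the subtarget cannot encode it.
    static int encodeSYSm(int BaseSYSm, bool NonSecureAlias, bool Has8MSecExt) {
      if (NonSecureAlias) {
        if (!Has8MSecExt)
          return -1;        // "_ns" views need the Security Extension
        return BaseSYSm | 0x80;
      }
      if (!Has8MSecExt &&
          (BaseSYSm == 0x0a || BaseSYSm == 0x0b || BaseSYSm > 0x14))
        return -1;          // msplim, psplim and sp are v8-M additions
      return BaseSYSm;
    }
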
@@ -3563,7 +3982,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { // Lower the read_register intrinsic to ARM specific DAG nodes // using the supplied metadata string to select the instruction node to use // and the registers/masks to construct as operands for the node. -SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ +bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){ const MDNodeSDNode *MD = dyn_cast(N->getOperand(1)); const MDString *RegString = dyn_cast(MD->getMD()->getOperand(0)); bool IsThumb2 = Subtarget->isThumb2(); @@ -3592,7 +4011,8 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ Ops.push_back(getAL(CurDAG, DL)); Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(N->getOperand(0)); - return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops)); + return true; } std::string SpecialReg = RegString->getString().lower(); @@ -3602,8 +4022,10 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked, - DL, MVT::i32, MVT::Other, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked, + DL, MVT::i32, MVT::Other, Ops)); + return true; } // The VFP registers are read by creating SelectionDAG nodes with opcodes @@ -3623,27 +4045,37 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ // If an opcode was found then we can lower the read to a VFP instruction. if (Opcode) { if (!Subtarget->hasVFP2()) - return nullptr; + return false; if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8()) - return nullptr; + return false; Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops); + ReplaceNode(N, + CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops)); + return true; } // If the target is M Class then need to validate that the register string // is an acceptable value, so check that a mask can be constructed from the // string. if (Subtarget->isMClass()) { - int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget); + StringRef Flags = "", Reg = SpecialReg; + if (Reg.endswith("_ns")) { + Flags = "ns"; + Reg = Reg.drop_back(3); + } + + int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget); if (SYSmValue == -1) - return nullptr; + return false; SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops)); + return true; } // Here we know the target is not M Class so we need to check if it is one @@ -3651,24 +4083,27 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ if (SpecialReg == "apsr" || SpecialReg == "cpsr") { Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL, - MVT::i32, MVT::Other, Ops); + ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? 
ARM::t2MRS_AR : ARM::MRS, + DL, MVT::i32, MVT::Other, Ops)); + return true; } if (SpecialReg == "spsr") { Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, - DL, MVT::i32, MVT::Other, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL, + MVT::i32, MVT::Other, Ops)); + return true; } - return nullptr; + return false; } // Lower the write_register intrinsic to ARM specific DAG nodes // using the supplied metadata string to select the instruction node to use // and the registers/masks to use in the nodes -SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ +bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){ const MDNodeSDNode *MD = dyn_cast(N->getOperand(1)); const MDString *RegString = dyn_cast(MD->getMD()->getOperand(0)); bool IsThumb2 = Subtarget->isThumb2(); @@ -3698,7 +4133,8 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); Ops.push_back(N->getOperand(0)); - return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops)); + return true; } std::string SpecialReg = RegString->getString().lower(); @@ -3707,8 +4143,10 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked, - DL, MVT::Other, Ops); + ReplaceNode( + N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked, + DL, MVT::Other, Ops)); + return true; } // The VFP registers are written to by creating SelectionDAG nodes with @@ -3724,16 +4162,17 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ if (Opcode) { if (!Subtarget->hasVFP2()) - return nullptr; + return false; Ops = { N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops)); + return true; } - SmallVector Fields; - StringRef(SpecialReg).split(Fields, '_', 1, false); - std::string Reg = Fields[0].str(); - StringRef Flags = Fields.size() == 2 ? Fields[1] : ""; + std::pair Fields; + Fields = StringRef(SpecialReg).rsplit('_'); + std::string Reg = Fields.first.str(); + StringRef Flags = Fields.second; // If the target was M Class then need to validate the special register value // and retrieve the mask for use in the instruction node. @@ -3745,12 +4184,13 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ } int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget); if (SYSmValue == -1) - return nullptr; + return false; SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops); + ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops)); + return true; } // We then check to see if a valid mask can be constructed for one of the @@ -3761,14 +4201,15 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2), getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; - return CurDAG->getMachineNode(IsThumb2 ? 
ARM::t2MSR_AR : ARM::MSR, - DL, MVT::Other, Ops); + ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR, + DL, MVT::Other, Ops)); + return true; } - return nullptr; + return false; } -SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ +bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ std::vector<SDValue> AsmNodeOperands; unsigned Flag, Kind; bool Changed = false; @@ -3823,6 +4264,17 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) IsTiedToChangedOp = OpChanged[DefIdx]; + // Memory operands to inline asm in the SelectionDAG are modeled with two + // operands: a constant of value InlineAsm::Kind_Mem followed by the input + // operand. If we get here and we have a Kind_Mem, skip the next operand (so + // it doesn't get misinterpreted), and continue. We do this here because + // it's important to update the OpChanged array correctly before moving on. + if (Kind == InlineAsm::Kind_Mem) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef && Kind != InlineAsm::Kind_RegDefEarlyClobber) continue; @@ -3912,12 +4364,13 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ if (Glue.getNode()) AsmNodeOperands.push_back(Glue); if (!Changed) - return nullptr; + return false; SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); New->setNodeId(-1); - return New.getNode(); + ReplaceNode(N, New.getNode()); + return true; } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 978e99cf511e..d6e7caf98a80 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -65,6 +65,13 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); +// Disabled for causing self-hosting failures once returned-attribute inference +// was enabled. +static cl::opt<bool> +EnableThisRetForwarding("arm-this-return-forwarding", cl::Hidden, + cl::desc("Directly forward this return"), + cl::init(false)); + namespace { class ARMCCState : public CCState { public: @@ -240,7 +247,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Set the correct calling convention for ARMv7k WatchOS. It's just // AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchOS()) { + if (Subtarget->isTargetWatchABI()) { for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } @@ -254,7 +261,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // RTLIB if (Subtarget->isAAPCS_ABI() && (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || - Subtarget->isTargetAndroid())) { + Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; @@ -390,10 +397,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { @@ -410,17 +413,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } - // The half <-> float conversion functions are always soft-float, but are - // needed for some targets which use a hard-float calling convention by - // default. - if (Subtarget->isAAPCS_ABI()) { - setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); - } else { - setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); - setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); - setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); + // The half <-> float conversion functions are always soft-float on + // non-watchos platforms, but are needed for some targets which use a + // hard-float calling convention by default. + if (!Subtarget->isTargetWatchABI()) { + if (Subtarget->isAAPCS_ABI()) { + setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); + } else { + setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); + setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); + setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); + } } // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have @@ -581,6 +586,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); + setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); // NEON does not have single instruction CTTZ for vectors. setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); @@ -712,6 +722,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); } + } else { + // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
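
An aside on the Thumb-1 comment above (illustrative only, not from the patch): the one post-increment form Thumb-1 does offer is a single-register LDM/STM with write-back, so a loop like the following can use it for its i32 loads even without the richer ARM/Thumb-2 indexed addressing modes.

    // The marked load can be selected as "ldm r0!, {r1}": load *p, then p += 4.
    int sum_words(const int *p, int n) {
      int s = 0;
      while (n--)
        s += *p++;   // post-incremented i32 load
      return s;
    }
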
+ setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); } setOperationAction(ISD::SADDO, MVT::i32, Custom); @@ -758,10 +772,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); - // These just redirect to CTTZ and CTLZ on ARM. - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - // @llvm.readcyclecounter requires the Performance Monitors extension. // Default to the 0 expansion on unsupported platforms. // FIXME: Technically there are older ARM CPUs that have @@ -773,19 +783,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); - if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && - !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { + bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide() + : Subtarget->hasDivideInARMMode(); + if (!hasDivide) { // These are expanded into libcalls if the cpu doesn't have HW divider. setOperationAction(ISD::SDIV, MVT::i32, LibCall); setOperationAction(ISD::UDIV, MVT::i32, LibCall); } + if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) { + setOperationAction(ISD::SDIV, MVT::i32, Custom); + setOperationAction(ISD::UDIV, MVT::i32, Custom); + + setOperationAction(ISD::SDIV, MVT::i64, Custom); + setOperationAction(ISD::UDIV, MVT::i64, Custom); + } + setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) - if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || + Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); + HasStandaloneRem = false; setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); @@ -807,6 +828,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::SDIVREM, MVT::i64, Custom); + setOperationAction(ISD::UDIVREM, MVT::i64, Custom); } else { setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -833,21 +856,21 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use - // the default expansion. If we are targeting a single threaded system, - // then set them all for expand so we can lower them later into their - // non-atomic form. - if (TM.Options.ThreadModel == ThreadModel::Single) - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); - else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { + // the default expansion. + InsertFencesForAtomic = false; + if (Subtarget->hasAnyDataBarrier() && + (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { // ATOMIC_FENCE needs custom lowering; the others should have been expanded // to ldrex/strex loops already. 
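
For reference on the ldrex/strex loops mentioned above, a sketch (the builtin shown is a standard GCC/Clang intrinsic, not something this patch adds): a 32-bit compare-and-swap expands to a load-exclusive/store-exclusive retry loop of roughly this shape.

    #include <stdint.h>
    // On ARMv7 this lowers to approximately:
    //   1: ldrex  r3, [r0]        ; load-exclusive the current value
    //      cmp    r3, r1          ; equal to 'expected'?
    //      bne    2f
    //      strex  r12, r2, [r0]   ; try to store 'desired'
    //      cmp    r12, #0
    //      bne    1b              ; reservation lost, retry
    //   2:
    uint32_t cas32(uint32_t *p, uint32_t expected, uint32_t desired) {
      return __sync_val_compare_and_swap(p, expected, desired);
    }
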
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + if (!Subtarget->isThumb() || !Subtarget->isMClass()) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. - if (!Subtarget->hasV8Ops()) { + if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. - setInsertFencesForAtomic(true); + InsertFencesForAtomic = true; } } else { // If there's anything we can use as a barrier, go through custom lowering @@ -909,6 +932,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + // Thumb-1 cannot currently select ARMISD::SUBE. + if (!Subtarget->isThumb1Only()) + setOperationAction(ISD::SETCCE, MVT::i32, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); @@ -956,7 +983,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->isTargetWatchOS()) { + if (Subtarget->isTargetWatchABI()) { setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); } @@ -1039,7 +1066,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setMinStackArgumentAlignment(4); // Prefer likely predicted branches to selects on out-of-order cores. - PredictableSelectIsExpensive = Subtarget->isLikeA9(); + PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -1106,7 +1133,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; - case ARMISD::tCALL: return "ARMISD::tCALL"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; @@ -1123,6 +1149,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::SSAT: return "ARMISD::SSAT"; + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1199,6 +1227,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; @@ -1373,7 +1402,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::ARM_APCS: case CallingConv::GHC: return CC; + case CallingConv::PreserveMost: + return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: + case CallingConv::Swift: return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) @@ -1415,18 +1447,18 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); case CallingConv::GHC: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); + case CallingConv::PreserveMost: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. -SDValue -ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const { +SDValue ARMTargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const { // Assign locations to each value returned by this call. SmallVector RVLocs; @@ -1442,7 +1474,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn) { + if (i == 0 && isThisReturn && EnableThisRetForwarding) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -1506,23 +1538,21 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, } /// LowerMemOpCallTo - Store the argument to the stack. -SDValue -ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, - SDValue StackPtr, SDValue Arg, - SDLoc dl, SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const { +SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, const SDLoc &dl, + SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); return DAG.getStore( Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } -void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, +void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, @@ -1704,7 +1734,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), - false, false, false, DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); @@ -1780,20 +1809,27 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
bool isDirect = false; - bool isARMFunc = false; + + const TargetMachine &TM = getTargetMachine(); + const Module *Mod = MF.getFunction()->getParent(); + const GlobalValue *GV = nullptr; + if (GlobalAddressSDNode *G = dyn_cast(Callee)) + GV = G->getGlobal(); + bool isStub = + !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); + + bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { - assert((Subtarget->isTargetWindows() || - getTargetMachine().getRelocationModel() == Reloc::Static) && - "long-calls with non-static relocation model!"); + assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && + "long-calls codegen is not position independent!"); // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra. - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); + if (isa(Callee)) { // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = @@ -1804,8 +1840,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); @@ -1819,54 +1854,55 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); - } - } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - isDirect = true; - bool isDef = GV->isStrongDefinitionForLinker(); - bool isStub = (!isDef && Subtarget->isTargetMachO()) && - getTargetMachine().getRelocationModel() != Reloc::Static; - isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); - // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); - // tBX takes a register source operand. - if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { - assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); - Callee = DAG.getNode( - ARMISD::WrapperPIC, dl, PtrVt, - DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, true, 0); - } else if (Subtarget->isTargetCOFF()) { - assert(Subtarget->isTargetWindows() && - "Windows is the only supported COFF target"); - unsigned TargetFlags = GV->hasDLLImportStorageClass() - ? 
ARMII::MO_DLLIMPORT - : ARMII::MO_NO_FLAG; - Callee = - DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); - if (GV->hasDLLImportStorageClass()) + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + } + } else if (isa(Callee)) { + // If we're optimizing for minimum size and the function is called three or + // more times in this block, we can improve codesize by calling indirectly + // as BLXr has a 16-bit encoding. + auto *GV = cast(Callee)->getGlobal(); + auto *BB = CLI.CS->getParent(); + bool PreferIndirect = + Subtarget->isThumb() && MF.getFunction()->optForMinSize() && + std::count_if(GV->user_begin(), GV->user_end(), [&BB](const User *U) { + return isa(U) && cast(U)->getParent() == BB; + }) > 2; + + if (!PreferIndirect) { + isDirect = true; + bool isDef = GV->isStrongDefinitionForLinker(); + + // ARM call to a local ARM function is predicable. + isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); + // tBX takes a register source operand. + if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = - DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), - DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); - } else { - // On ELF targets for PIC code, direct calls should go through the PLT - unsigned OpFlags = 0; - if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_) - OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); + /* Alignment = */ 0, MachineMemOperand::MOInvariant); + } else if (Subtarget->isTargetCOFF()) { + assert(Subtarget->isTargetWindows() && + "Windows is the only supported COFF target"); + unsigned TargetFlags = GV->hasDLLImportStorageClass() + ? ARMII::MO_DLLIMPORT + : ARMII::MO_NO_FLAG; + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, + TargetFlags); + if (GV->hasDLLImportStorageClass()) + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(DAG.getMachineFunction())); + } else { + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); + } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; - bool isStub = Subtarget->isTargetMachO() && - getTargetMachine().getRelocationModel() != Reloc::Static; - isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // tBX takes a register source operand. 
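The `PreferIndirect` computation above can be read in isolation as the following heuristic (a sketch using the same names; note that it counts any instruction user of GV in the block, which over-approximates "calls"):

    static bool preferIndirectCall(const GlobalValue *GV, const BasicBlock *BB,
                                   bool IsThumb, bool MinSize) {
      if (!IsThumb || !MinSize)
        return false;
      unsigned UsesInBB = 0;
      for (const User *U : GV->users())
        if (isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB)
          ++UsesInBB;
      // Three or more uses: materialize the address once, then use the
      // 16-bit BLXr encoding per call site instead of a 32-bit BL each.
      return UsesInBB > 2;
    }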
const char *Sym = S->getSymbol(); if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { @@ -1878,17 +1914,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { - unsigned OpFlags = 0; - // On ELF targets for PIC code, direct calls should go through the PLT - if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_) - OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); } } @@ -1898,11 +1928,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else - CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; + CallOpc = ARMISD::CALL; } else { if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; - else if (doesNotRet && isDirect && Subtarget->hasRAS() && + else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && // Emit regular call when code size is the priority !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP @@ -2042,7 +2072,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, if (!Def) return false; if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(Def, FI)) + if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { return false; @@ -2082,9 +2112,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { - const Function *CallerF = DAG.getMachineFunction().getFunction(); + MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); - bool CCMatch = CallerCC == CalleeCC; assert(Subtarget->supportsTailCall()); @@ -2122,41 +2152,25 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. - if (!CCMatch) { - SmallVector RVLocs1; - ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext(), Call); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); - - SmallVector RVLocs2; - ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext(), Call); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) + // Check that the call results are passed in the same way. + LLVMContext &C = *DAG.getContext(); + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForNode(CalleeCC, true, isVarArg), + CCAssignFnForNode(CallerCC, true, isVarArg))) + return false; + // The callee has to preserve all registers the caller needs to preserve. 
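The regmask comparison in the lines that follow implements this rule: a tail-called convention must preserve at least everything the caller's convention promises to preserve, otherwise the tail call could clobber a callee-saved register the caller's own caller relies on. Conceptually (a sketch; in LLVM regmasks a set bit means "preserved"):

    static bool preservesSuperset(const uint32_t *CallerMask,
                                  const uint32_t *CalleeMask, unsigned NumRegs) {
      for (unsigned R = 0; R != NumRegs; ++R) {
        bool CallerKeeps = CallerMask[R / 32] & (1u << (R % 32));
        bool CalleeKeeps = CalleeMask[R / 32] & (1u << (R % 32));
        if (CallerKeeps && !CalleeKeeps)
          return false;
      }
      return true;
    }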
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + if (CalleeCC != CallerCC) { + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } // If Caller's vararg or byval argument has been split between registers and // stack, do not perform tail call, since part of the argument is in caller's // local frame. - const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). - getInfo(); + const ARMFunctionInfo *AFI_Caller = MF.getInfo(); if (AFI_Caller->getArgRegsSaveSize()) return false; @@ -2166,13 +2180,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; - ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext(), Call); + ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC, false, isVarArg)); if (CCInfo.getNextStackOffset()) { - MachineFunction &MF = DAG.getMachineFunction(); - // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2209,6 +2220,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; } return true; @@ -2226,7 +2241,7 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, } static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, - SDLoc DL, SelectionDAG &DAG) { + const SDLoc &DL, SelectionDAG &DAG) { const MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); @@ -2259,11 +2274,11 @@ static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, } SDValue -ARMTargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, +ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const { + const SDLoc &dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -2521,9 +2536,9 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDLoc DL(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; - if (RelocM == Reloc::Static) { + bool IsPositionIndependent = isPositionIndependent(); + if (!IsPositionIndependent) { CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; @@ -2534,11 +2549,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); - SDValue Result = - DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 0); - if (RelocM == Reloc::Static) + SDValue Result = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + if (!IsPositionIndependent) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); @@ -2584,7 +2598,8 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SDValue FuncTLVGet = DAG.getLoad(MVT::i32, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, true, true, 4); + /* Alignment = */ 4, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFunction &F = DAG.getMachineFunction(); @@ -2610,6 +2625,61 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); } +SDValue +ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); + + SDValue Chain = DAG.getEntryNode(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + + // Load the current TEB (thread environment block) + SDValue Ops[] = {Chain, + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32)}; + SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + + SDValue TEB = CurrentTEB.getValue(0); + Chain = CurrentTEB.getValue(1); + + // Load the ThreadLocalStoragePointer from the TEB + // A pointer to the TLS array is located at offset 0x2c from the TEB. + SDValue TLSArray = + DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); + TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); + + // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 + // offset into the TLSArray. 
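The loads that follow complete the walk this lowering performs; written out as pseudo-C, with offsets taken from the comments in the hunk and `_tls_index` supplied by the Microsoft CRT:

    // char  *TEB      = mrc p15, 0, c13, c0, 2;     // thread ID register
    // char **TlsArray = *(char ***)(TEB + 0x2c);    // ThreadLocalStoragePointer
    // char  *TlsData  = TlsArray[_tls_index];       // this module's TLS block
    // var_addr        = TlsData + SECREL32(var);    // offset within .tls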
+ + // Load the TLS index from the C runtime + SDValue TLSIndex = + DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); + TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); + TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); + + SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, + DAG.getConstant(2, DL, MVT::i32)); + SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, + DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), + MachinePointerInfo()); + + // Get the offset of the start of the .tls section (section base) + const auto *GA = cast(Op); + auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); + SDValue Offset = DAG.getLoad( + PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, + DAG.getTargetConstantPool(CPV, PtrVT, 4)), + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + + return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); +} + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, @@ -2625,10 +2695,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 0); + Argument = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2645,8 +2714,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), - DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args), - 0); + DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2680,8 +2748,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2689,8 +2756,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2700,8 +2766,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread @@ -2714,6 +2779,9 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if 
(Subtarget->isTargetDarwin()) return LowerGlobalTLSAddressDarwin(Op, DAG); + if (Subtarget->isTargetWindows()) + return LowerGlobalTLSAddressWindows(Op, DAG); + // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast(Op); @@ -2738,9 +2806,9 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - bool UseGOT_PREL = - !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + const TargetMachine &TM = getTargetMachine(); + if (isPositionIndependent()) { + bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2756,15 +2824,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue Chain = Result.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); if (UseGOT_PREL) - Result = DAG.getLoad(PtrVT, dl, Chain, Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + Result = + DAG.getLoad(PtrVT, dl, Chain, Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } @@ -2781,8 +2848,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } @@ -2791,7 +2857,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); if (Subtarget->useMovt(DAG.getMachineFunction())) ++NumMovwMovt; @@ -2799,15 +2864,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into multiple nodes unsigned Wrapper = - RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper; + isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); - if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + if (Subtarget->isGVIndirectSymbol(GV)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } @@ -2833,8 +2897,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } @@ -2873,7 +2936,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, "RBIT intrinsic must have i32 type!"); return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } - case Intrinsic::arm_thread_pointer: { + case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } @@ -2882,10 +2945,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; - unsigned PCAdj = (RelocM != Reloc::PIC_) - ? 0 : (Subtarget->isThumb() ? 4 : 8); + bool IsPositionIndependent = isPositionIndependent(); + unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); @@ -2893,10 +2955,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, 0); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - if (RelocM == Reloc::PIC_) { + if (IsPositionIndependent) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } @@ -2962,7 +3023,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; - } else if (Subtarget->isSwift() && Ord == Release) { + } else if (Subtarget->preferISHSTBarriers() && + Ord == AtomicOrdering::Release) { // Swift happens to implement ISHST barriers in a way that's compatible with // Release semantics but weaker than ISH so we'd be fools not to use // it. Beware: other processors probably don't! 
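The barrier-domain selection above reduces to roughly this mapping (a sketch; `preferISHSTBarriers()` presumably generalizes the old `isSwift()` check, and ISH is assumed to be the default domain chosen elsewhere in this function):

    // M-class:                      any fence       -> dmb sy  (only option)
    // preferISHSTBarriers() cores:  fence release   -> dmb ishst
    // otherwise:                    fence           -> dmb ish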
@@ -3012,13 +3074,14 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } -SDValue -ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, - SDValue &Root, SelectionDAG &DAG, - SDLoc dl) const { +SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, + CCValAssign &NextVA, + SDValue &Root, + SelectionDAG &DAG, + const SDLoc &dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -3041,8 +3104,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad( MVT::i32, dl, Root, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, - false, false, 0); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); @@ -3060,13 +3122,11 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. // Return: The frame index registers were stored into. -int -ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, - SDLoc dl, SDValue &Chain, - const Value *OrigArg, - unsigned InRegsParamRecordIdx, - int ArgOffset, - unsigned ArgSize) const { +int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, + const SDLoc &dl, SDValue &Chain, + const Value *OrigArg, + unsigned InRegsParamRecordIdx, + int ArgOffset, unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. Non-var-args function, and we meet first byval parameter. // Setup first unallocated register as first byval register; @@ -3104,9 +3164,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { unsigned VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(OrigArg, 4 * i), false, false, 0); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(OrigArg, 4 * i)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } @@ -3117,17 +3176,16 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, } // Setup stack frame, the va_list pointer will start from. -void -ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - SDLoc dl, SDValue &Chain, - unsigned ArgOffset, - unsigned TotalArgRegsSaveSize, - bool ForceMutable) const { +void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + const SDLoc &dl, SDValue &Chain, + unsigned ArgOffset, + unsigned TotalArgRegsSaveSize, + bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); // Try to store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing + // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. 
// If there is no regs to be stored, just point address after last // argument passed via stack. @@ -3137,14 +3195,10 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, AFI->setVarArgsFrameIndex(FrameIndex); } -SDValue -ARMTargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl - &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) - const { +SDValue ARMTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -3226,10 +3280,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad( - MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI)); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3322,10 +3375,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(DAG.getLoad( - VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI))); } lastInsIndex = index; } @@ -3369,10 +3421,9 @@ static bool isFloatingPointZero(SDValue Op) { /// Returns appropriate ARM CMP (cmp) and corresponding condition code for /// the given operands. -SDValue -ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMcc, SelectionDAG &DAG, - SDLoc dl) const { +SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMcc, SelectionDAG &DAG, + const SDLoc &dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate(C)) { @@ -3428,9 +3479,8 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
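When the comparison constant fails `isLegalICmpImmediate` in getARMCmp above, the predicate is rewritten against an encodable neighbour rather than materializing the constant in a register, roughly (a sketch of the adjustments; each applies only when the adjusted constant is itself encodable):

    // cmp x, #C with unencodable C:
    //   x <  C   ->  x <= C-1    (SETLT -> SETLE)
    //   x >= C   ->  x >  C-1    (SETGE -> SETGT)
    // plus the unsigned analogues (SETULT/SETUGE).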
-SDValue -ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) const { +SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, + SelectionDAG &DAG, const SDLoc &dl) const { assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); SDValue Cmp; if (!isFloatingPointZero(RHS)) @@ -3647,7 +3697,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, } } -SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, +SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { if (Subtarget->isFPOnlySP() && VT == MVT::f64) { @@ -3673,14 +3723,149 @@ SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, } } +static bool isGTorGE(ISD::CondCode CC) { + return CC == ISD::SETGT || CC == ISD::SETGE; +} + +static bool isLTorLE(ISD::CondCode CC) { + return CC == ISD::SETLT || CC == ISD::SETLE; +} + +// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. +// All of these conditions (and their <= and >= counterparts) will do: +// x < k ? k : x +// x > k ? x : k +// k < x ? x : k +// k > x ? k : x +static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, + const SDValue TrueVal, const SDValue FalseVal, + const ISD::CondCode CC, const SDValue K) { + return (isGTorGE(CC) && + ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || + (isLTorLE(CC) && + ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); +} + +// Similar to isLowerSaturate(), but checks for upper-saturating conditions. +static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, + const SDValue TrueVal, const SDValue FalseVal, + const ISD::CondCode CC, const SDValue K) { + return (isGTorGE(CC) && + ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || + (isLTorLE(CC) && + ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); +} + +// Check if two chained conditionals could be converted into SSAT. +// +// SSAT can replace a set of two conditional selectors that bound a number to an +// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: +// +// x < -k ? -k : (x > k ? k : x) +// x < -k ? -k : (x < k ? x : k) +// x > -k ? (x > k ? k : x) : -k +// x < k ? (x < -k ? -k : x) : k +// etc. +// +// It returns true if the conversion can be done, false otherwise. +// Additionally, the variable is returned in parameter V and the constant in K. +static bool isSaturatingConditional(const SDValue &Op, SDValue &V, + uint64_t &K) { + + SDValue LHS1 = Op.getOperand(0); + SDValue RHS1 = Op.getOperand(1); + SDValue TrueVal1 = Op.getOperand(2); + SDValue FalseVal1 = Op.getOperand(3); + ISD::CondCode CC1 = cast(Op.getOperand(4))->get(); + + const SDValue Op2 = isa(TrueVal1) ? FalseVal1 : TrueVal1; + if (Op2.getOpcode() != ISD::SELECT_CC) + return false; + + SDValue LHS2 = Op2.getOperand(0); + SDValue RHS2 = Op2.getOperand(1); + SDValue TrueVal2 = Op2.getOperand(2); + SDValue FalseVal2 = Op2.getOperand(3); + ISD::CondCode CC2 = cast(Op2.getOperand(4))->get(); + + // Find out which are the constants and which are the variables + // in each conditional + SDValue *K1 = isa(LHS1) ? &LHS1 : isa(RHS1) + ? &RHS1 + : NULL; + SDValue *K2 = isa(LHS2) ? &LHS2 : isa(RHS2) + ? &RHS2 + : NULL; + SDValue K2Tmp = isa(TrueVal2) ? TrueVal2 : FalseVal2; + SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; + SDValue V2Tmp = (K2 && *K2 == LHS2) ? 
RHS2 : LHS2; + SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; + + // We must detect cases where the original operations worked with 16- or + // 8-bit values. In such case, V2Tmp != V2 because the comparison operations + // must work with sign-extended values but the select operations return + // the original non-extended value. + SDValue V2TmpReg = V2Tmp; + if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) + V2TmpReg = V2Tmp->getOperand(0); + + // Check that the registers and the constants have the correct values + // in both conditionals + if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || + V2TmpReg != V2) + return false; + + // Figure out which conditional is saturating the lower/upper bound. + const SDValue *LowerCheckOp = + isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? &Op + : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 + : NULL; + const SDValue *UpperCheckOp = + isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? &Op + : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 + : NULL; + + if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) + return false; + + // Check that the constant in the lower-bound check is + // the opposite of the constant in the upper-bound check + // in 1's complement. + int64_t Val1 = cast(*K1)->getSExtValue(); + int64_t Val2 = cast(*K2)->getSExtValue(); + int64_t PosVal = std::max(Val1, Val2); + + if (((Val1 > Val2 && UpperCheckOp == &Op) || + (Val1 < Val2 && UpperCheckOp == &Op2)) && + Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) { + + V = V2; + K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive + return true; + } + + return false; +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + + // Try to convert two saturating conditional selects into a single SSAT + SDValue SatValue; + uint64_t SatConstant; + if (isSaturatingConditional(Op, SatValue, SatConstant)) + return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); - SDLoc dl(Op); if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, @@ -3781,10 +3966,9 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { return DAG.getConstant(0, SDLoc(Op), MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) - return DAG.getLoad(MVT::i32, SDLoc(Op), - Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); + return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); } @@ -3801,21 +3985,17 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); - RetVal1 = DAG.getLoad(MVT::i32, dl, - Ld->getChain(), Ptr, - Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); + RetVal1 = + DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->getAlignment(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); unsigned 
NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); - RetVal2 = DAG.getLoad(MVT::i32, dl, - Ld->getChain(), NewPtr, - Ld->getPointerInfo().getWithOffset(4), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), NewAlign); + RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, + Ld->getPointerInfo().getWithOffset(4), NewAlign, + Ld->getMemOperand()->getFlags()); return; } @@ -3908,8 +4088,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (getTargetMachine().Options.UnsafeFPMath && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { - SDValue Result = OptimizeVFPBrcond(Op, DAG); - if (Result.getNode()) + if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) return Result; } @@ -3950,19 +4129,17 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, Addr, Op.getOperand(2), JTI); } - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + if (isPositionIndependent()) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } @@ -4156,7 +4333,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ SDValue Offset = DAG.getConstant(4, dl, MVT::i32); return DAG.getLoad(VT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4178,8 +4355,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), - false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4322,7 +4498,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. -static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! 
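A worked example for the isSaturatingConditional/SSAT path above: clamping to [-128, 127].

    int clamp8(int x) { return x < -128 ? -128 : (x > 127 ? 127 : x); }

Here Val1 = -128 and Val2 = 127, so Val1 == ~Val2, PosVal = 127, and PosVal + 1 == 128 is a power of two; the select pair becomes a single ARMISD::SSAT node carrying countTrailingOnes(127) == 7. Assuming the instruction pattern encodes that as width minus one (the A32 SSAT encoding stores the saturation width less one), this prints as `ssat r0, #8, r0`.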
SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); @@ -4826,12 +5002,36 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } +static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); + + SDValue FVal = DAG.getConstant(0, DL, MVT::i32); + SDValue TVal = DAG.getConstant(1, DL, MVT::i32); + SDValue ARMcc = DAG.getConstant( + IntCCToARMCC(cast(Cond)->get()), DL, MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, + Cmp.getValue(1), SDValue()); + return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, + CCR, Chain.getValue(1)); +} + /// isNEONModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - SDLoc dl, EVT &VT, bool is128Bits, + const SDLoc &dl, EVT &VT, bool is128Bits, NEONModImmType type) { unsigned OpCmode, Imm; @@ -4979,7 +5179,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); // Try splatting with a VMOV.f32... - APFloat FPVal = CFP->getValueAPF(); + const APFloat &FPVal = CFP->getValueAPF(); int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); if (ImmVal != -1) { @@ -5421,7 +5621,7 @@ static bool isReverseMask(ArrayRef M, EVT VT) { // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
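SETCCE, handled by LowerSETCCE above, is the borrow-consuming half of an expanded wide compare; the lowering maps it onto the flags, roughly (a sketch for an i64 compare split into two i32 halves):

    // lo:  SUBC a.lo, b.lo           -> produces the borrow
    // hi:  SUBE a.hi, b.hi, borrow   -> CPSR now reflects the full i64 compare
    // res: CMOV(0, 1, IntCCToARMCC(cond), CPSR)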
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, - const ARMSubtarget *ST, SDLoc dl) { + const ARMSubtarget *ST, const SDLoc &dl) { uint64_t Val; if (!isa(N)) return SDValue(); @@ -5502,7 +5702,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -5585,7 +5785,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -5635,7 +5835,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDValue Vec = DAG.getUNDEF(VT); for (unsigned i = 0 ; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -5681,7 +5881,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { // A shuffle can only come from building a vector from various @@ -5808,7 +6008,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5845,7 +6045,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5895,7 +6095,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); @@ -5982,12 +6182,12 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); - if (V2.getNode()->getOpcode() == ISD::UNDEF) + if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); + DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); + DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); } static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, @@ -6024,7 +6224,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize <= 32) { - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; @@ -6040,7 +6240,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { !isa(V1.getOperand(0))) { bool IsScalarToVector = true; for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) - if (V1.getOperand(i).getOpcode() != ISD::UNDEF) { + if (!V1.getOperand(i).isUndef()) { IsScalarToVector = false; break; } @@ -6067,8 +6267,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - if (V2->getOpcode() == ISD::UNDEF && - isSingletonVEXTMask(ShuffleMask, VT, Imm)) { + if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } @@ -6103,8 +6302,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // -> // concat(VZIP(v1, v2):0, :1) // - if (V1->getOpcode() == ISD::CONCAT_VECTORS && - V2->getOpcode() == ISD::UNDEF) { + if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); @@ -6175,11 +6373,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); - if (VT == MVT::v8i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); - if (NewOp.getNode()) + if (VT == MVT::v8i8) + if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; - } return SDValue(); } @@ -6218,11 +6414,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (Op0.getOpcode() != ISD::UNDEF) + if (!Op0.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0, dl)); - if (Op1.getOpcode() != ISD::UNDEF) + if (!Op1.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1, dl)); @@ -6351,17 +6547,16 @@ static SDValue 
SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), - LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), - LD->isNonTemporal(), LD->isInvariant(), - LD->getAlignment()); + LD->getBasePtr(), LD->getPointerInfo(), + LD->getAlignment(), LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), - LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(), - LD->isNonTemporal(), LD->getAlignment()); + LD->getMemoryVT(), LD->getAlignment(), + LD->getMemOperand()->getFlags()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, @@ -6387,8 +6582,9 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, - BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); + return DAG.getBuildVector( + MVT::v2i32, SDLoc(N), + {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); } // Construct a new BUILD_VECTOR with elements truncated to half the size. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); @@ -6405,8 +6601,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::getVectorVT(TruncVT, NumElts), Ops); + return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { @@ -6506,8 +6701,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } -static SDValue -LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { +static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, + SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? // Convert to float @@ -6528,8 +6723,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { // float4 result = as_float4(as_int4(xf*recip) + 0xb000); X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); - Y = DAG.getConstant(0xb000, dl, MVT::i32); - Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); + Y = DAG.getConstant(0xb000, dl, MVT::v4i32); X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); // Convert back to short. @@ -6538,8 +6732,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { return X; } -static SDValue -LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { +static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, + SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? 
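The 0xb000 adjustment in LowerSDIV_v4i8 above implements small-integer division in floating point: multiply by a reciprocal estimate, then nudge the product's bit pattern upward so the final truncating convert lands on the exact quotient. A rough scalar model of the v4i8 path (a sketch, not the exact NEON op sequence; the real code uses VRECPE plus a VRECPS refinement step rather than a true divide):

    #include <cstdint>
    #include <cstring>

    static int8_t sdiv8_via_float(int8_t x, int8_t y) {
      float xf = (float)x, yf = (float)y;
      float recip = 1.0f / yf;          // stands in for VRECPE + VRECPS
      float q = xf * recip;
      uint32_t bits;
      std::memcpy(&bits, &q, sizeof(bits));
      bits += 0xb000;                   // bias so truncation rounds correctly
      std::memcpy(&q, &bits, sizeof(bits));
      return (int8_t)q;                 // truncating convert back
    }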
SDValue N2; @@ -6567,8 +6761,7 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); - N1 = DAG.getConstant(0x89, dl, MVT::i32); - N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N1 = DAG.getConstant(0x89, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. @@ -6679,8 +6872,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); - N1 = DAG.getConstant(2, dl, MVT::i32); - N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N1 = DAG.getConstant(2, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. @@ -6766,21 +6958,21 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setCallee(CC, RetTy, Callee, std::move(Args)) .setDiscardResult(ShouldUseSRet); std::pair CallResult = LowerCallTo(CLI); if (!ShouldUseSRet) return CallResult.first; - SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, - MachinePointerInfo(), false, false, false, 0); + SDValue LoadSin = + DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); // Address of cos field. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, - MachinePointerInfo(), false, false, false, 0); + SDValue LoadCos = + DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, @@ -6819,7 +7011,7 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), - ES, std::move(Args), 0); + ES, std::move(Args)); return LowerCallTo(CLI).first; } @@ -6867,13 +7059,13 @@ void ARMTargetLowering::ExpandDIV_Windows( } static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { - // Monotonic load/store is legal for all targets - if (cast(Op)->getOrdering() <= Monotonic) - return Op; + if (isStrongerThanMonotonic(cast(Op)->getOrdering())) + // Acquire/Release load/store is not legal for targets without a dmb or + // equivalent available. + return SDValue(); - // Acquire/Release load/store is not legal for targets without a - // dmb or equivalent available. - return SDValue(); + // Monotonic load/store is legal for all targets. 
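The inverted condition above preserves the behaviour: monotonic (and weaker) atomic load/store is natively legal, while anything stronger is reported illegal here so that generic expansion inserts the barriers. On pre-v8 ARM that expansion looks roughly like (a sketch):

    // load atomic monotonic  -> plain ldr
    // load atomic acquire    -> ldr ; dmb ish
    // store atomic release   -> dmb ish ; str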
+ return Op; } static void ReplaceREADCYCLECOUNTER(SDNode *N, @@ -6899,6 +7091,46 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, Results.push_back(Cycles32.getValue(1)); } +static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { + SDLoc dl(V.getNode()); + SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); + SDValue VHi = DAG.getAnyExtOrTrunc( + DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), + dl, MVT::i32); + SDValue RegClass = + DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); + SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); + SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); + const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; + return SDValue( + DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); +} + +static void ReplaceCMP_SWAP_64Results(SDNode *N, + SmallVectorImpl & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i64 && + "AtomicCmpSwap on types less than 64 should be legal"); + SDValue Ops[] = {N->getOperand(1), + createGPRPairNode(DAG, N->getOperand(2)), + createGPRPairNode(DAG, N->getOperand(3)), + N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + ARM::CMP_SWAP_64, SDLoc(N), + DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, + SDValue(CmpSwap, 0))); + Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, + SDValue(CmpSwap, 0))); + Results.push_back(SDValue(CmpSwap, 2)); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); @@ -6948,6 +7180,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); @@ -6956,8 +7189,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::UDIV: return LowerUDIV(Op, DAG); + case ISD::SDIV: + if (Subtarget->isTargetWindows()) + return LowerDIV_Windows(Op, DAG, /* Signed */ true); + return LowerSDIV(Op, DAG); + case ISD::UDIV: + if (Subtarget->isTargetWindows()) + return LowerDIV_Windows(Op, DAG, /* Signed */ false); + return LowerUDIV(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: @@ -7005,6 +7244,13 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::UREM: Res = LowerREM(N, DAG); break; + case ISD::SDIVREM: + case ISD::UDIVREM: + Res = LowerDivRem(SDValue(N, 0), DAG); + assert(Res.getNumOperands() == 2 && "DivRem needs two values"); + Results.push_back(Res.getValue(0)); + Results.push_back(Res.getValue(1)); + return; case 
ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; @@ -7013,6 +7259,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, Results); + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_64Results(N, Results, DAG); + return; } if (Res.getNode()) Results.push_back(Res); @@ -7024,11 +7273,12 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. -void ARMTargetLowering:: -SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, - MachineBasicBlock *DispatchBB, int FI) const { +void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, + MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, + int FI) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineConstantPool *MCP = MF->getConstantPool(); @@ -7139,10 +7389,10 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, } } -void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, +void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineFrameInfo *MFI = MF->getFrameInfo(); @@ -7182,7 +7432,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; - SmallPtrSet InvokeBBs; + SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl &MBBList = CallSiteNumToLPad[I]; @@ -7200,7 +7450,6 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); unsigned MJTI = JTI->createJumpTableIndex(LPadList); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); // Create the MBBs for the dispatch code. @@ -7244,6 +7493,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // registers being marked as clobbered. MIB.addRegMask(RI.getNoPreservedMask()); + bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); @@ -7357,7 +7607,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addMemOperand(JTMMOLd)); unsigned NewVReg6 = NewVReg5; - if (RelocM == Reloc::PIC_) { + if (IsPositionIndependent) { NewVReg6 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) .addReg(ARM::CPSR, RegState::Define) @@ -7440,7 +7690,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addImm(0) .addMemOperand(JTMMOLd)); - if (RelocM == Reloc::PIC_) { + if (IsPositionIndependent) { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg4) @@ -7524,7 +7774,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, (*I)->setIsEHPad(false); // The instruction is gone now. 
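The CMP_SWAP_64 path above packs the i64 operands into GPRPair with REG_SEQUENCE because LDREXD/STREXD require an even/odd register pair. The pseudo later expands to the usual exclusive-load loop, with this shape (a sketch):

    @ 1: ldrexd  r4, r5, [p]            @ load current 64-bit value
    @    compare r4/r5 with the expected pair; bne 2f
    @    strexd  r6, newlo, newhi, [p]
    @    cmp r6, #0 ; bne 1b            @ retry if store-exclusive failed
    @ 2:                                @ gsub_0/gsub_1 extract the i64 result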
- MI->eraseFromParent(); + MI.eraseFromParent(); } static @@ -7576,8 +7826,8 @@ static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { /// Emit a post-increment load operation with given size. The instructions /// will be added to BB at Pos. -static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, - const TargetInstrInfo *TII, DebugLoc dl, +static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, + const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); @@ -7608,8 +7858,8 @@ static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, /// Emit a post-increment store operation with given size. The instructions /// will be added to BB at Pos. -static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, - const TargetInstrInfo *TII, DebugLoc dl, +static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, + const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); @@ -7637,7 +7887,7 @@ static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, } MachineBasicBlock * -ARMTargetLowering::EmitStructByval(MachineInstr *MI, +ARMTargetLowering::EmitStructByval(MachineInstr &MI, MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). @@ -7646,11 +7896,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI->getOperand(0).getReg(); - unsigned src = MI->getOperand(1).getReg(); - unsigned SizeVal = MI->getOperand(2).getImm(); - unsigned Align = MI->getOperand(3).getImm(); - DebugLoc dl = MI->getDebugLoc(); + unsigned dest = MI.getOperand(0).getReg(); + unsigned src = MI.getOperand(1).getReg(); + unsigned SizeVal = MI.getOperand(2).getImm(); + unsigned Align = MI.getOperand(3).getImm(); + DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -7722,7 +7972,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, srcIn = srcOut; destIn = destOut; } - MI->eraseFromParent(); // The instruction is gone now. + MI.eraseFromParent(); // The instruction is gone now. return BB; } @@ -7848,7 +8098,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Add epilogue to handle BytesLeft. BB = exitMBB; - MachineInstr *StartOfExit = exitMBB->begin(); + auto StartOfExit = exitMBB->begin(); // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) @@ -7866,16 +8116,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, destIn = destOut; } - MI->eraseFromParent(); // The instruction is gone now. + MI.eraseFromParent(); // The instruction is gone now. 
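[The emitPostLd/emitPostSt helpers above build the struct-byval copy out of post-incremented loads and stores, with the epilogue consuming BytesLeft one byte at a time. A minimal scalar sketch of that loop shape; the function name is ours, not LLVM's.]

#include <cstddef>
#include <cstdint>

void byteCopyTail(uint8_t *Dst, const uint8_t *Src, size_t BytesLeft) {
  while (BytesLeft--) {
    uint8_t Scratch = *Src++;  // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    *Dst++ = Scratch;          // [destOut] = STRB_POST(scratch, destIn, 1)
  }
}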
return BB; } MachineBasicBlock * -ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, +ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); assert(Subtarget->isTargetWindows() && "__chkstk is only supported on Windows"); @@ -7930,24 +8180,26 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) - .addReg(ARM::SP).addReg(ARM::R4))); + .addReg(ARM::SP, RegState::Kill) + .addReg(ARM::R4, RegState::Kill) + .setMIFlags(MachineInstr::FrameSetup))); - MI->eraseFromParent(); + MI.eraseFromParent(); return MBB; } MachineBasicBlock * -ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); - MF->push_back(ContBB); + MF->insert(++MBB->getIterator(), ContBB); ContBB->splice(ContBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - MBB->addSuccessor(ContBB); + ContBB->transferSuccessorsAndUpdatePHIs(MBB); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); MF->push_back(TrapBB); @@ -7955,74 +8207,89 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, MBB->addSuccessor(TrapBB); BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) - .addReg(MI->getOperand(0).getReg()) + .addReg(MI.getOperand(0).getReg()) .addMBB(TrapBB); + AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB)); + MBB->addSuccessor(ContBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return ContBB; } MachineBasicBlock * -ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, +ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = MI.getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: { - MI->dump(); + MI.dump(); llvm_unreachable("Unexpected instr type to insert"); } + + // Thumb1 post-indexed loads are really just single-register LDMs. + case ARM::tLDR_postidx: { + BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) + .addOperand(MI.getOperand(1)) // Rn_wb + .addOperand(MI.getOperand(2)) // Rn + .addOperand(MI.getOperand(3)) // PredImm + .addOperand(MI.getOperand(4)) // PredReg + .addOperand(MI.getOperand(0)); // Rt + MI.eraseFromParent(); + return BB; + } + // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. case ARM::t2STR_preidx: - MI->setDesc(TII->get(ARM::t2STR_PRE)); + MI.setDesc(TII->get(ARM::t2STR_PRE)); return BB; case ARM::t2STRB_preidx: - MI->setDesc(TII->get(ARM::t2STRB_PRE)); + MI.setDesc(TII->get(ARM::t2STRB_PRE)); return BB; case ARM::t2STRH_preidx: - MI->setDesc(TII->get(ARM::t2STRH_PRE)); + MI.setDesc(TII->get(ARM::t2STRH_PRE)); return BB; case ARM::STRi_preidx: case ARM::STRBi_preidx: { - unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? - ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; + unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? 
ARM::STR_PRE_IMM + : ARM::STRB_PRE_IMM; // Decode the offset. - unsigned Offset = MI->getOperand(4).getImm(); + unsigned Offset = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; Offset = ARM_AM::getAM2Offset(Offset); if (isSub) Offset = -Offset; - MachineMemOperand *MMO = *MI->memoperands_begin(); + MachineMemOperand *MMO = *MI.memoperands_begin(); BuildMI(*BB, MI, dl, TII->get(NewOpc)) - .addOperand(MI->getOperand(0)) // Rn_wb - .addOperand(MI->getOperand(1)) // Rt - .addOperand(MI->getOperand(2)) // Rn - .addImm(Offset) // offset (skip GPR==zero_reg) - .addOperand(MI->getOperand(5)) // pred - .addOperand(MI->getOperand(6)) - .addMemOperand(MMO); - MI->eraseFromParent(); + .addOperand(MI.getOperand(0)) // Rn_wb + .addOperand(MI.getOperand(1)) // Rt + .addOperand(MI.getOperand(2)) // Rn + .addImm(Offset) // offset (skip GPR==zero_reg) + .addOperand(MI.getOperand(5)) // pred + .addOperand(MI.getOperand(6)) + .addMemOperand(MMO); + MI.eraseFromParent(); return BB; } case ARM::STRr_preidx: case ARM::STRBr_preidx: case ARM::STRH_preidx: { unsigned NewOpc; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); - for (unsigned i = 0; i < MI->getNumOperands(); ++i) - MIB.addOperand(MI->getOperand(i)); - MI->eraseFromParent(); + for (unsigned i = 0; i < MI.getNumOperands(); ++i) + MIB.addOperand(MI.getOperand(i)); + MI.eraseFromParent(); return BB; } @@ -8055,8 +8322,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); - BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) - .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + BuildMI(BB, dl, TII->get(ARM::tBcc)) + .addMBB(sinkMBB) + .addImm(MI.getOperand(3).getImm()) + .addReg(MI.getOperand(4).getReg()); // copy0MBB: // %FalseValue = ... @@ -8070,12 +8339,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(*BB, BB->begin(), dl, - TII->get(ARM::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) + .addReg(MI.getOperand(1).getReg()) + .addMBB(copy0MBB) + .addReg(MI.getOperand(2).getReg()) + .addMBB(thisMBB); - MI->eraseFromParent(); // The pseudo instruction is gone now. + MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -8086,10 +8356,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Compare both parts that make up the double comparison separately for // equality. - bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; + bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; - unsigned LHS1 = MI->getOperand(1).getReg(); - unsigned LHS2 = MI->getOperand(2).getReg(); + unsigned LHS1 = MI.getOperand(1).getReg(); + unsigned LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPri : ARM::CMPri)) @@ -8098,8 +8368,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { - unsigned RHS1 = MI->getOperand(3).getReg(); - unsigned RHS2 = MI->getOperand(4).getReg(); + unsigned RHS1 = MI.getOperand(3).getReg(); + unsigned RHS2 = MI.getOperand(4).getReg(); AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1).addReg(RHS1)); @@ -8108,9 +8378,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, .addImm(ARMCC::EQ).addReg(ARM::CPSR); } - MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); + MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); - if (MI->getOperand(0).getImm() == ARMCC::NE) + if (MI.getOperand(0).getImm() == ARMCC::NE) std::swap(destMBB, exitMBB); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) @@ -8120,7 +8390,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, else BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); - MI->eraseFromParent(); // The pseudo instruction is gone now. + MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -8157,9 +8427,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); - unsigned int ABSSrcReg = MI->getOperand(1).getReg(); - unsigned int ABSDstReg = MI->getOperand(0).getReg(); - bool ABSSrcKIll = MI->getOperand(1).isKill(); + unsigned int ABSSrcReg = MI.getOperand(1).getReg(); + unsigned int ABSDstReg = MI.getOperand(0).getReg(); + bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or @@ -8204,7 +8474,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction - MI->eraseFromParent(); + MI.eraseFromParent(); // return last added BB return SinkBB; @@ -8223,38 +8493,38 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, /// when it is expanded into LDM/STM. This is done as a post-isel lowering /// instead of as a custom inserter because we need the use list from the SDNode. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, - MachineInstr *MI, const SDNode *Node) { + MachineInstr &MI, const SDNode *Node) { bool isThumb1 = Subtarget->isThumb1Only(); - DebugLoc DL = MI->getDebugLoc(); - MachineFunction *MF = MI->getParent()->getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB(*MF, MI); // If the new dst/src is unused mark it as dead. if (!Node->hasAnyUseOfValue(0)) { - MI->getOperand(0).setIsDead(true); + MI.getOperand(0).setIsDead(true); } if (!Node->hasAnyUseOfValue(1)) { - MI->getOperand(1).setIsDead(true); + MI.getOperand(1).setIsDead(true); } // The MEMCPY both defines and kills the scratch registers. - for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? 
&ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } -void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, +void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { - if (MI->getOpcode() == ARM::MEMCPY) { + if (MI.getOpcode() == ARM::MEMCPY) { attachMEMCPYScratchRegs(Subtarget, MI, Node); return; } - const MCInstrDesc *MCID = &MI->getDesc(); + const MCInstrDesc *MCID = &MI.getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional // operand is still set to noreg. If needed, set the optional operand's @@ -8263,24 +8533,24 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // e.g. ADCS (..., CPSR) -> ADC (... opt:CPSR). // Rename pseudo opcodes. - unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); + unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); if (NewOpc) { const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); - assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && + assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 && "converted opcode should be the same except for cc_out"); - MI->setDesc(*MCID); + MI.setDesc(*MCID); // Add the optional cc_out operand - MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); + MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); } unsigned ccOutIdx = MCID->getNumOperands() - 1; // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. - if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { + if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } @@ -8288,14 +8558,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // since we already have an optional CPSR def. bool definesCPSR = false; bool deadCPSR = false; - for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands(); - i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; + ++i) { + const MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { definesCPSR = true; if (MO.isDead()) deadCPSR = true; - MI->RemoveOperand(i); + MI.RemoveOperand(i); break; } } @@ -8305,14 +8575,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, } assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); if (deadCPSR) { - assert(!MI->getOperand(ccOutIdx).getReg() && + assert(!MI.getOperand(ccOutIdx).getReg() && "expect uninitialized optional cc_out operand"); return; } // If this instruction was defined with an optional CPSR def and its dag node // had a live implicit CPSR def, then activate the optional CPSR def. 
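[AdjustInstrPostInstrSelection above decides whether the optional cc_out operand becomes a live CPSR def, i.e. whether ADD is emitted as flag-setting ADDS. A hedged scalar sketch of the extra state the S bit exposes; the helper name is illustrative.]

#include <cstdint>

// Unsigned add that also reports the carry-out CPSR would capture.
bool addWithCarryOut(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;     // the ADD/ADDS arithmetic is identical
  return Sum < A;  // carry flag: only ADDS (cc_out = CPSR) makes it visible
}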
- MachineOperand &MO = MI->getOperand(ccOutIdx); + MachineOperand &MO = MI.getOperand(ccOutIdx); MO.setReg(ARM::CPSR); MO.setIsDef(true); } @@ -8442,16 +8712,12 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getNode()->hasOneUse()) { - SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes); - if (Result.getNode()) + if (N0.getNode()->hasOneUse()) + if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) return Result; - } - if (N1.getNode()->hasOneUse()) { - SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes); - if (Result.getNode()) + if (N1.getNode()->hasOneUse()) + if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) return Result; - } return SDValue(); } @@ -8533,7 +8799,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, // Get widened type and narrowed type. MVT widenType; unsigned numElem = VT.getVectorNumElements(); - + EVT inputLaneType = Vec.getValueType().getVectorElementType(); switch (inputLaneType.getSimpleVT().SimpleTy) { case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; @@ -8559,11 +8825,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (Subtarget->isThumb1Only()) return SDValue(); - - // Only perform the checks after legalize when the pattern is available. - if (DCI.isBeforeLegalize()) return SDValue(); - // Look for multiply add opportunities. // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is @@ -8691,14 +8952,97 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, return resNode; } +static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // UMAAL is similar to UMLAL except that it adds two unsigned values. + // While trying to combine for the other MLAL nodes, first search for the + // chance to use UMAAL. Check if Addc uses another addc node which can first + // be combined into a UMLAL. The other pattern is AddcNode being combined + // into an UMLAL and then using another addc is handled in ISelDAGToDAG. + + if (!Subtarget->hasV6Ops()) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + SDNode *PrevAddc = nullptr; + if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC) + PrevAddc = AddcNode->getOperand(0).getNode(); + else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC) + PrevAddc = AddcNode->getOperand(1).getNode(); + + // If there's no addc chains, just return a search for any MLAL. + if (PrevAddc == nullptr) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + // Try to convert the addc operand to an MLAL and if that fails try to + // combine AddcNode. + SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget); + if (MLAL != SDValue(PrevAddc, 0)) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + // Find the converted UMAAL or quit if it doesn't exist. 
+  SDNode *UmlalNode = nullptr;
+  SDValue AddHi;
+  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
+    UmlalNode = AddcNode->getOperand(0).getNode();
+    AddHi = AddcNode->getOperand(1);
+  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
+    UmlalNode = AddcNode->getOperand(1).getNode();
+    AddHi = AddcNode->getOperand(0);
+  } else {
+    return SDValue();
+  }
+
+  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
+  // the ADDC as well as Zero.
+  auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
+
+  if (!Zero || Zero->getZExtValue() != 0)
+    return SDValue();
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (!AddeNode)
+    return SDValue();
+
+  if ((AddeNode->getOperand(0).getNode() == Zero &&
+       AddeNode->getOperand(1).getNode() == UmlalNode) ||
+      (AddeNode->getOperand(0).getNode() == UmlalNode &&
+       AddeNode->getOperand(1).getNode() == Zero)) {
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
+                      UmlalNode->getOperand(2), AddHi };
+    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
+                                DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+    // Replace the ADDs' nodes uses by the UMAAL node's values.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
+
+    // Return original node to notify the driver to stop replacing.
+    return SDValue(AddcNode, 0);
+  }
+  return SDValue();
+}
+
 /// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
 static SDValue PerformADDCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
-  return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
 }
 
 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
@@ -8710,15 +9054,13 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                           const ARMSubtarget *Subtarget){
   // Attempt to create vpaddl for this add.
-  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
-  if (Result.getNode())
+  if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
     return Result;
 
   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
-  if (N0.getNode()->hasOneUse()) {
-    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
-    if (Result.getNode()) return Result;
-  }
+  if (N0.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
+      return Result;
   return SDValue();
 }
 
@@ -8731,8 +9073,7 @@ static SDValue PerformADDCombine(SDNode *N,
   SDValue N1 = N->getOperand(1);
 
   // First try with the default operand order.
-  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
-  if (Result.getNode())
+  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
 
   // If that didn't work, try again with the operands commuted.
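[For reference, the scalar semantics of UMAAL that the combine above targets: a 32x32->64 multiply accumulating two further 32-bit addends. The result can never overflow, since (2^32-1)^2 + 2*(2^32-1) = 2^64-1. A minimal sketch, not LLVM code:]

#include <cstdint>

uint64_t umaal(uint32_t RdLo, uint32_t RdHi, uint32_t Rn, uint32_t Rm) {
  // UMLAL folds in one 32-bit addend; UMAAL (via the AddHi operand above)
  // folds in a second one.
  return static_cast<uint64_t>(Rn) * Rm + RdLo + RdHi;
}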
@@ -8747,10 +9088,9 @@ static SDValue PerformSUBCombine(SDNode *N, SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) - if (N1.getNode()->hasOneUse()) { - SDValue Result = combineSelectAndUse(N, N1, N0, DCI); - if (Result.getNode()) return Result; - } + if (N1.getNode()->hasOneUse()) + if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) + return Result; return SDValue(); } @@ -8920,8 +9260,7 @@ static SDValue PerformANDCombine(SDNode *N, if (!Subtarget->isThumb1Only()) { // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) - SDValue Result = combineSelectAndUseCommutative(N, true, DCI); - if (Result.getNode()) + if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) return Result; } @@ -8963,8 +9302,7 @@ static SDValue PerformORCombine(SDNode *N, if (!Subtarget->isThumb1Only()) { // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) - SDValue Result = combineSelectAndUseCommutative(N, false, DCI); - if (Result.getNode()) + if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; } @@ -9137,8 +9475,7 @@ static SDValue PerformXORCombine(SDNode *N, if (!Subtarget->isThumb1Only()) { // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) - SDValue Result = combineSelectAndUseCommutative(N, false, DCI); - if (Result.getNode()) + if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; } @@ -9300,17 +9637,15 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc DL(LD); SDValue BasePtr = LD->getBasePtr(); - SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, - LD->getPointerInfo(), LD->isVolatile(), - LD->isNonTemporal(), LD->isInvariant(), - LD->getAlignment()); + SDValue NewLD1 = + DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), + LD->getAlignment(), LD->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); - SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, - LD->getPointerInfo(), LD->isVolatile(), - LD->isNonTemporal(), LD->isInvariant(), - std::min(4U, LD->getAlignment() / 2)); + SDValue NewLD2 = DAG.getLoad( + MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), + std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); if (DCI.DAG.getDataLayout().isBigEndian()) @@ -9364,11 +9699,9 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, // into a pair of GPRs, which is fine when the value is used as a scalar, // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. SelectionDAG &DAG = DCI.DAG; - if (N->getNumOperands() == 2) { - SDValue RV = PerformVMOVDRRCombine(N, DAG); - if (RV.getNode()) + if (N->getNumOperands() == 2) + if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) return RV; - } // Load i64 elements as f64 values so that type legalization does not split // them up into i32 values. 
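[The combineSelectAndUse folds above rest on simple scalar identities; a sketch of the add case, with the analogous sub/and/or/xor variants using the appropriate identity element (0 or -1). Function names are illustrative only.]

// x + (cc ? 0 : c) == cc ? x : (x + c), so the add can be speculated and
// the select moved outward.
int addOfSelect(bool CC, int X, int C) { return X + (CC ? 0 : C); }
int selectOfAdd(bool CC, int X, int C) { return CC ? X : (X + C); }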
@@ -9385,7 +9718,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, DCI.AddToWorklist(V.getNode()); } EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops); + SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, BV); } @@ -9434,7 +9767,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Assume only bit cast to i32 will go away. if (Elt->getOperand(0).getValueType() == MVT::i32) ++NumOfBitCastedElts; - } else if (Elt.getOpcode() == ISD::UNDEF || isa(Elt)) + } else if (Elt.isUndef() || isa(Elt)) // Constants are statically casted, thus do not count them as // relevant operands. --NumOfRelevantElts; @@ -9461,7 +9794,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc dl(N); for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { SDValue V = N->getOperand(Idx); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (V.getOpcode() == ISD::BITCAST && V->getOperand(0).getValueType() == MVT::i32) @@ -9529,8 +9862,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); SDValue Concat0Op1 = Op0.getOperand(1); SDValue Concat1Op1 = Op1.getOperand(1); - if (Concat0Op1.getOpcode() != ISD::UNDEF || - Concat1Op1.getOpcode() != ISD::UNDEF) + if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) return SDValue(); // Skip the transformation if any of the types are illegal. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -9557,7 +9889,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { NewMask.push_back(NewElt); } return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, - DAG.getUNDEF(VT), NewMask.data()); + DAG.getUNDEF(VT), NewMask); } /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, @@ -9953,7 +10285,7 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); + ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. @@ -9984,8 +10316,8 @@ static SDValue PerformSTORECombine(SDNode *N, StoreType, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); @@ -10004,18 +10336,18 @@ static SDValue PerformSTORECombine(SDNode *N, bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = DAG.getStore(St->getChain(), DL, - StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), - BasePtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + SDValue NewST1 = DAG.getStore( + St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), + BasePtr, St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 
0 : 1), - OffsetPtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), - std::min(4U, St->getAlignment() / 2)); + OffsetPtr, St->getPointerInfo(), + std::min(4U, St->getAlignment() / 2), + St->getMemOperand()->getFlags()); } if (StVal.getValueType() == MVT::i64 && @@ -10038,9 +10370,8 @@ static SDValue PerformSTORECombine(SDNode *N, DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment(), - St->getAAInfo()); + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags(), St->getAAInfo()); } // If this is a legal vector store, try to combine it into a VST1_UPD. @@ -10066,7 +10397,8 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -10123,7 +10455,7 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); - if (!N->getValueType(0).isVector() || + if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); @@ -10464,7 +10796,7 @@ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, // The operand to BFI is already a mask suitable for removing the bits it // sets. ConstantSDNode *CI = cast(Op.getOperand(2)); - APInt Mask = CI->getAPIntValue(); + const APInt &Mask = CI->getAPIntValue(); KnownZero &= Mask; KnownOne &= Mask; return; @@ -10522,7 +10854,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D } else { assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } - + if (Op1->getOpcode() != ISD::OR) return SDValue(); @@ -10552,7 +10884,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D SDLoc dl(X); EVT VT = X.getValueType(); unsigned BitInX = AndC->getAPIntValue().logBase2(); - + if (BitInX != 0) { // We must shift X first. X = DAG.getNode(ISD::SRL, dl, VT, X, @@ -10573,6 +10905,46 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. +SDValue +ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { + SDValue Cmp = N->getOperand(4); + if (Cmp.getOpcode() != ARMISD::CMPZ) + // Only looking at NE cases. 
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+  SDValue Chain = N->getOperand(0);
+  SDValue BB = N->getOperand(1);
+  SDValue ARMcc = N->getOperand(2);
+  ARMCC::CondCodes CC =
+      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
+  // -> (brcond Chain BB CC CPSR Cmp)
+  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
+      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
+      LHS->getOperand(0)->hasOneUse()) {
+    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
+    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
+    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
+        (LHS01C && LHS01C->getZExtValue() == 1) &&
+        (LHS1C && LHS1C->getZExtValue() == 1) &&
+        (RHSC && RHSC->getZExtValue() == 0)) {
+      return DAG.getNode(
+          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
+          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
+    }
+  }
+
+  return SDValue();
+}
+
 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
 SDValue
 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -10626,6 +10998,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
                        N->getOperand(3), NewCmp);
   }
 
+  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
+  // -> (cmov F T CC CPSR Cmp)
+  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
+    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
+    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
+        (LHS1C && LHS1C->getZExtValue() == 1) &&
+        (RHSC && RHSC->getZExtValue() == 0)) {
+      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                         LHS->getOperand(2), LHS->getOperand(3),
+                         LHS->getOperand(4));
+    }
+  }
+
   if (Res.getNode()) {
     APInt KnownZero, KnownOne;
     DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
@@ -10676,6 +11063,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
   case ISD::LOAD: return PerformLOADCombine(N, DCI);
   case ARMISD::VLD2DUP:
   case ARMISD::VLD3DUP:
@@ -11198,22 +11586,37 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
-  if (Subtarget->isThumb1Only())
-    return false;
-
   EVT VT;
   SDValue Ptr;
-  bool isSEXTLoad = false;
+  bool isSEXTLoad = false, isNonExt;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     VT = LD->getMemoryVT();
     Ptr = LD->getBasePtr();
     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     VT = ST->getMemoryVT();
     Ptr = ST->getBasePtr();
+    isNonExt = !ST->isTruncatingStore();
   } else
     return false;
 
+  if (Subtarget->isThumb1Only()) {
+    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+    // must be non-extending/truncating, i32, with an offset of 4.
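[Scalar shape of the BRCOND/CMOV folds above, where a boolean materialised by a 0/1 CMOV is immediately re-tested against zero. A sketch; the name is ours.]

int cmovOfCmov(bool CC, int T, int F) {
  int B = CC ? 1 : 0;       // inner (cmov 0 1 CC CPSR Cmp)
  return (B != 0) ? T : F;  // outer cmpz/cmov; folds to plain CC ? T : F
}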
+ assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); + if (Op->getOpcode() != ISD::ADD || !isNonExt) + return false; + auto *RHS = dyn_cast(Op->getOperand(1)); + if (!RHS || RHS->getZExtValue() != 4) + return false; + + Offset = Op->getOperand(1); + Base = Op->getOperand(0); + AM = ISD::POST_INC; + return true; + } + bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) @@ -11322,6 +11725,26 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } +const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. + // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasVFP2()) + return "r"; + if (ConstraintVT.isFloatingPoint()) + return "w"; + if (ConstraintVT.isVector() && Subtarget->hasNEON() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. ARMTargetLowering::ConstraintType @@ -11640,7 +12063,8 @@ static TargetLowering::ArgListTy getDivRemArgList( } SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || + Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && @@ -11664,7 +12088,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); @@ -11702,7 +12126,7 @@ SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { // Lower call CallLoweringInfo CLI(DAG); CLI.setChain(InChain) - .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); std::pair CallResult = LowerCallTo(CLI); @@ -11950,23 +12374,20 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { - if (!getInsertFencesForAtomic()) - return nullptr; - switch (Ord) { - case NotAtomic: - case Unordered: + case AtomicOrdering::NotAtomic: + case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/non-atomic"); - case Monotonic: - case Acquire: + case AtomicOrdering::Monotonic: + case AtomicOrdering::Acquire: return nullptr; // Nothing to do - case SequentiallyConsistent: + case AtomicOrdering::SequentiallyConsistent: if (!IsStore) return nullptr; // Nothing to do /*FALLTHROUGH*/ - case Release: - case AcquireRelease: - if 
(Subtarget->isSwift()) + case AtomicOrdering::Release: + case AtomicOrdering::AcquireRelease: + if (Subtarget->preferISHSTBarriers()) return makeDMB(Builder, ARM_MB::ISHST); // FIXME: add a comment with a link to documentation justifying this. else @@ -11978,19 +12399,16 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { - if (!getInsertFencesForAtomic()) - return nullptr; - switch (Ord) { - case NotAtomic: - case Unordered: + case AtomicOrdering::NotAtomic: + case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/not-atomic"); - case Monotonic: - case Release: + case AtomicOrdering::Monotonic: + case AtomicOrdering::Release: return nullptr; // Nothing to do - case Acquire: - case AcquireRelease: - case SequentiallyConsistent: + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitTrailingFence"); @@ -12031,7 +12449,17 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. + return getTargetMachine().getOptLevel() != 0; +} + +bool ARMTargetLowering::shouldInsertFencesForAtomic( + const Instruction *I) const { + return InsertFencesForAtomic; } // This has so far only been implemented for MachO. @@ -12080,7 +12508,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i32, i32} and we have to recombine them into a @@ -12124,7 +12552,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i64 intrinsics take two // parameters: "i32, i32". We must marshal Val into the appropriate form diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 96b56c3ec330..4906686616bc 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -43,7 +43,6 @@ namespace llvm { CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. - tCALL, // Thumb function call. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). @@ -61,6 +60,8 @@ namespace llvm { CMOV, // ARM conditional move instructions. 
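[The fence hooks and the emitLoadLinked/emitStoreConditional intrinsics changed above serve C++11-style atomics. A minimal sketch of the cmpxchg they implement, in standard C++ with no LLVM types; the comment describes the typical ARM lowering, not a guarantee.]

#include <atomic>

bool casOnce(std::atomic<long long> &V, long long &Expected, long long Desired) {
  // On ARM this typically becomes an LDREXD / compare / STREXD retry loop,
  // with DMB barriers placed by emitLeadingFence/emitTrailingFence; at -O0
  // the backend now keeps the whole sequence as a late-expanded pseudo so
  // register spills cannot clear the exclusive monitor mid-loop.
  return V.compare_exchange_strong(Expected, Desired);
}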
+ SSAT, // Signed saturation + BCC_i64, SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. @@ -164,6 +165,7 @@ namespace llvm { UMLAL, // 64bit Unsigned Accumulate Multiply SMLAL, // 64bit Signed Accumulate Multiply + UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other @@ -251,13 +253,14 @@ namespace llvm { EVT VT) const override; MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, + void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; @@ -335,6 +338,8 @@ namespace llvm { getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + const char *LowerXConstraint(EVT ConstraintVT) const override; + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is /// true it means one of the asm constraint of the inline asm instruction @@ -453,6 +458,7 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; @@ -468,6 +474,14 @@ namespace llvm { bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool supportSwiftError() const override { + return true; + } + + bool hasStandaloneRem(EVT VT) const override { + return HasStandaloneRem; + } + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -486,29 +500,34 @@ namespace llvm { /// unsigned ARMPCLabelIndex; + // TODO: remove this, and have shouldInsertFencesForAtomic do the proper + // check. 
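[When hasStandaloneRem() is false, as the new hook above lets a subtarget report, srem/urem are expanded in terms of the division. A sketch of that rewrite, illustrative only:]

int remViaDiv(int A, int B) {
  // a % b == a - (a / b) * b; with a combined libcall like __aeabi_idivmod
  // both values arrive at once, so no standalone REM operation is needed.
  return A - (A / B) * B;
}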
+ bool InsertFencesForAtomic; + + bool HasStandaloneRem = true; + void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); std::pair getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; typedef SmallVector, 8> RegsToPassVector; - void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, - SDValue Chain, SDValue &Arg, - RegsToPassVector &RegsToPass, + void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, + SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const; SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, - SDLoc dl) const; + const SDLoc &dl) const; CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - SDLoc dl, SelectionDAG &DAG, + const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; @@ -527,6 +546,7 @@ namespace llvm { SelectionDAG &DAG, TLSModel::Model model) const; SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; @@ -576,9 +596,9 @@ namespace llvm { SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const; bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && @@ -590,23 +610,19 @@ namespace llvm { const SmallVectorImpl &Exits) const override; SDValue - LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, - SDLoc dl, SDValue &Chain, - const Value *OrigArg, - unsigned InRegsParamRecordIdx, - int ArgOffset, + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + + int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, + SDValue &Chain, const Value *OrigArg, + unsigned InRegsParamRecordIdx, int ArgOffset, unsigned ArgSize) const; void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - SDLoc dl, SDValue &Chain, - unsigned ArgOffset, - unsigned TotalArgRegsSaveSize, + const SDLoc &dl, SDValue &Chain, + unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; SDValue @@ -634,42 +650,39 @@ namespace llvm { const SmallVectorImpl &Outs, LLVMContext &Context) const override; - SDValue - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) 
const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SDLoc &dl, SelectionDAG &DAG) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; - SDValue getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal, + SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const; - SDValue getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, SDLoc dl) const; + SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const; + SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, + const SDLoc &dl) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; - void SetupEntryBlockForSjLj(MachineInstr *MI, - MachineBasicBlock *MBB, + void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; - void EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const; + void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; - bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; + bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitStructByval(MachineInstr *MI, + MachineBasicBlock *EmitStructByval(MachineInstr &MI, MachineBasicBlock *MBB) const; - MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, + MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI, MachineBasicBlock *MBB) const; - MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; }; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index e79608d360ca..37a83f70a1fb 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -246,23 +246,33 @@ def shr_imm64 : Operand, ImmLeaf 0 && Imm <= 64; }]> { let ParserMatchClass = shr_imm64_asm_operand; } + +// ARM Assembler operand for ldr Rd, =expression which generates an offset +// to a constant pool entry or a MOV depending on the value of expression +def const_pool_asm_operand : AsmOperandClass { let Name = "ConstPoolAsmImm"; } +def const_pool_asm_imm : Operand { + let ParserMatchClass = const_pool_asm_operand; +} + + //===----------------------------------------------------------------------===// // ARM Assembler alias templates. 
// -class ARMInstAlias - : InstAlias, Requires<[IsARM]>; -class tInstAlias - : InstAlias, Requires<[IsThumb]>; -class t2InstAlias - : InstAlias, Requires<[IsThumb2]>; -class VFP2InstAlias - : InstAlias, Requires<[HasVFP2]>; -class VFP2DPInstAlias - : InstAlias, Requires<[HasVFP2,HasDPVFP]>; -class VFP3InstAlias - : InstAlias, Requires<[HasVFP3]>; -class NEONInstAlias - : InstAlias, Requires<[HasNEON]>; +// Note: When EmitPriority == 1, the alias will be used for printing +class ARMInstAlias + : InstAlias, Requires<[IsARM]>; +class tInstAlias + : InstAlias, Requires<[IsThumb]>; +class t2InstAlias + : InstAlias, Requires<[IsThumb2]>; +class VFP2InstAlias + : InstAlias, Requires<[HasVFP2]>; +class VFP2DPInstAlias + : InstAlias, Requires<[HasVFP2,HasDPVFP]>; +class VFP3InstAlias + : InstAlias, Requires<[HasVFP3]>; +class NEONInstAlias + : InstAlias, Requires<[HasNEON]>; class VFP2MnemonicAlias : MnemonicAlias, @@ -563,12 +573,12 @@ class AIstrex opcod, dag oops, dag iops, InstrItinClass itin, class AIldaex opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AIldr_ex_or_acq, - Requires<[IsARM, HasV8]>; + Requires<[IsARM, HasAcquireRelease, HasV7Clrex]>; class AIstlex opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AIstr_ex_or_rel, - Requires<[IsARM, HasV8]> { + Requires<[IsARM, HasAcquireRelease, HasV7Clrex]> { bits<4> Rd; let Inst{15-12} = Rd; } @@ -593,12 +603,12 @@ class AIswp pattern> class AIldracq opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AIldr_ex_or_acq, - Requires<[IsARM, HasV8]>; + Requires<[IsARM, HasAcquireRelease]>; class AIstrrel opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : AIstr_ex_or_rel, - Requires<[IsARM, HasV8]> { + Requires<[IsARM, HasAcquireRelease]> { let Inst{15-12} = 0b1111; } @@ -1379,11 +1389,6 @@ class T2Ipostldst opcod, bit load, bit pre, let DecoderMethod = "DecodeT2LdStPre"; } -// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. -class Tv5Pat : Pat { - list Predicates = [IsThumb, IsThumb1Only, HasV5T]; -} - // T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. class T1Pat : Pat { list Predicates = [IsThumb, IsThumb1Only]; @@ -1495,6 +1500,32 @@ class ASI5 opcod1, bits<2> opcod2, dag oops, dag iops, let D = VFPNeonDomain; } +class AHI5 opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list pattern> + : VFPI { + list Predicates = [HasFullFP16]; + + // Instruction operands. + bits<5> Sd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Sd{0}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Sd{4-1}; + let Inst{7-0} = addr{7-0}; // imm8 + + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1001; // Half precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + // VFP Load / store multiple pseudo instructions. class PseudoVFPLdStM pattern> @@ -1817,6 +1848,114 @@ class ASbIn opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{22} = Sd{0}; } +// Half precision, unary, predicated +class AHuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list pattern> + : VFPAI { + list Predicates = [HasFullFP16]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. 
+ let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = 0b1001; // Half precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Half precision, unary, non-predicated +class AHuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, + string asm, list pattern> + : VFPXI { + list Predicates = [HasFullFP16]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = 0b1001; // Half precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Half precision, binary +class AHbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list pattern> + : VFPAI { + list Predicates = [HasFullFP16]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1001; // Half precision + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Half precision, binary, not predicated +class AHbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, + InstrItinClass itin, string asm, list pattern> + : VFPXI { + list Predicates = [HasFullFP16]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1001; // Half precision + let Inst{6} = opcod3; + let Inst{4} = 0; +} + // VFP conversion instructions class AVConv1I opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, dag oops, dag iops, InstrItinClass itin, string opc, string asm, @@ -2321,22 +2460,25 @@ class NEONFPPat : Pat { } // VFP/NEON Instruction aliases for type suffices. 
-class VFPDataTypeInstAlias : - InstAlias, Requires<[HasVFP2]>; +// Note: When EmitPriority == 1, the alias will be used for printing +class VFPDataTypeInstAlias : + InstAlias, Requires<[HasVFP2]>; -multiclass VFPDTAnyInstAlias { - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; +// Note: When EmitPriority == 1, the alias will be used for printing +multiclass VFPDTAnyInstAlias { + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; } -multiclass NEONDTAnyInstAlias { +// Note: When EmitPriority == 1, the alias will be used for printing +multiclass NEONDTAnyInstAlias { let Predicates = [HasNEON] in { - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; - def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; + def : VFPDataTypeInstAlias; } } diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index cf973d68085f..98b1b4ca4272 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -90,29 +90,29 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, - Reloc::Model RM) const { +void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MI->getParent()->getParent(); const ARMSubtarget &Subtarget = MF.getSubtarget(); + const TargetMachine &TM = MF.getTarget(); if (!Subtarget.useMovt(MF)) { - if (RM == Reloc::PIC_) - expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM); + if (TM.isPositionIndependent()) + expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12); else - expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM); + expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12); return; } - if (RM != Reloc::PIC_) { - expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM); + if (!TM.isPositionIndependent()) { + expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12); return; } const GlobalValue *GV = cast((*MI->memoperands_begin())->getValue()); - if (!Subtarget.GVIsIndirectSymbol(GV, RM)) { - expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM); + if (!Subtarget.isGVIndirectSymbol(GV)) { + expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12); return; } @@ -123,9 +123,9 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg) .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); - unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4); MIB.addMemOperand(MMO); MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 90f34ea08401..4b1b7097b18d 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -39,8 +39,7 @@ public: const ARMRegisterInfo &getRegisterInfo() const override { return RI; } private: - void expandLoadStackGuard(MachineBasicBlock::iterator MI, - Reloc::Model RM) const override; + void 
expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index c446ba3109e4..060376b0a273 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -90,12 +90,6 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; -def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>, SDTCisVT<3, i32>, - SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >; -def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>; -def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>; - // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; @@ -128,6 +122,8 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; +def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; + def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -201,6 +197,12 @@ def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; def HasV6M : Predicate<"Subtarget->hasV6MOps()">, AssemblerPredicate<"HasV6MOps", "armv6m or armv6t2">; +def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">, + AssemblerPredicate<"HasV8MBaselineOps", + "armv8m.base">; +def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">, + AssemblerPredicate<"HasV8MMainlineOps", + "armv8m.main">; def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, AssemblerPredicate<"HasV6T2Ops", "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; @@ -235,6 +237,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasRAS : Predicate<"Subtarget->hasRAS()">, + AssemblerPredicate<"FeatureRAS", "ras">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, @@ -251,6 +255,12 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">, def HasDB : Predicate<"Subtarget->hasDataBarrier()">, AssemblerPredicate<"FeatureDB", "data-barriers">; +def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">, + AssemblerPredicate<"FeatureV7Clrex", + "v7 clrex">; +def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">, + AssemblerPredicate<"FeatureAcquireRelease", + "acquire/release">; def HasMP : Predicate<"Subtarget->hasMPExtension()">, AssemblerPredicate<"FeatureMP", "mp-extensions">; @@ -260,6 +270,9 @@ def HasVirtualization: Predicate<"false">, def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">, AssemblerPredicate<"FeatureTrustZone", "TrustZone">; +def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">, + AssemblerPredicate<"Feature8MSecExt", + "ARMv8-M Security Extensions">; def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; @@ -279,6 +292,8 @@ def IsARM : Predicate<"!Subtarget->isThumb()">, def IsMachO : Predicate<"Subtarget->isTargetMachO()">; def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def IsWindows : Predicate<"Subtarget->isTargetWindows()">; +def IsNotWindows : 
Predicate<"!Subtarget->isTargetWindows()">; def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, AssemblerPredicate<"FeatureNaClTrap", "NaCl">; def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; @@ -301,19 +316,16 @@ def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" " Subtarget->hasVFP4()) || " "Subtarget->isTargetDarwin()">; -// VGETLNi32 is microcoded on Swift - prefer VMOV. -def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">; -def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">; +def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; +def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; -// VDUP.32 is microcoded on Swift - prefer VMOV. -def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">; -def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">; +def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">; +def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">; -// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as -// this allows more effective execution domain optimization. See -// setExecutionDomain(). -def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">; -def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">; +def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||" + "!Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&" + "Subtarget->useNEONForSinglePrecisionFP()">; def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">; def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">; @@ -360,8 +372,6 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; }], hi16>; -class BinOpWithFlagFrag : - PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>; class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag : PatFrag<(ops node:$Src), res>; @@ -408,34 +418,35 @@ def brtarget : Operand { let DecoderMethod = "DecodeT2BROperand"; } -// FIXME: get rid of this one? -def uncondbrtarget : Operand { - let EncoderMethod = "getUnconditionalBranchTargetOpValue"; - let OperandType = "OPERAND_PCREL"; +// Branches targeting ARM-mode must be divisible by 4 if they're a raw +// immediate. +def ARMBranchTarget : AsmOperandClass { + let Name = "ARMBranchTarget"; } -// Branch target for ARM. Handles conditional/unconditional -def br_target : Operand { - let EncoderMethod = "getARMBranchTargetOpValue"; - let OperandType = "OPERAND_PCREL"; +// Branches targeting Thumb-mode must be divisible by 2 if they're a raw +// immediate. +def ThumbBranchTarget : AsmOperandClass { + let Name = "ThumbBranchTarget"; } -// Call target. -// FIXME: rename bltarget to t2_bl_target? -def bltarget : Operand { - // Encoded the same as branch targets. - let EncoderMethod = "getBranchTargetOpValue"; +def arm_br_target : Operand { + let ParserMatchClass = ARMBranchTarget; + let EncoderMethod = "getARMBranchTargetOpValue"; let OperandType = "OPERAND_PCREL"; } // Call target for ARM. Handles conditional/unconditional // FIXME: rename bl_target to t2_bltarget? -def bl_target : Operand { +def arm_bl_target : Operand { + let ParserMatchClass = ARMBranchTarget; let EncoderMethod = "getARMBLTargetOpValue"; let OperandType = "OPERAND_PCREL"; } -def blx_target : Operand { +// Target for BLX *from* ARM mode. 
+def arm_blx_target : Operand { + let ParserMatchClass = ThumbBranchTarget; let EncoderMethod = "getARMBLXTargetOpValue"; let OperandType = "OPERAND_PCREL"; } @@ -981,6 +992,21 @@ def addrmode5_pre : AddrMode5 { let PrintMethod = "printAddrMode5Operand"; } +// addrmode5fp16 := reg +/- imm8*2 +// +def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; } +class AddrMode5FP16 : Operand, + ComplexPattern { + let EncoderMethod = "getAddrMode5FP16OpValue"; + let DecoderMethod = "DecodeAddrMode5FP16Operand"; + let ParserMatchClass = AddrMode5FP16AsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm); +} + +def addrmode5fp16 : AddrMode5FP16 { + let PrintMethod = "printAddrMode5FP16Operand"; +} + // addrmode6 := reg with optional alignment // def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; } @@ -1224,7 +1250,7 @@ include "ARMInstrFormats.td" let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_bin_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, - PatFrag opnode, bit Commutable = 0> { + SDPatternOperator opnode, bit Commutable = 0> { // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. let isReMaterializable = 1 in { @@ -1297,7 +1323,7 @@ multiclass AsI1_bin_irs opcod, string opc, let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_rbin_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, - PatFrag opnode, bit Commutable = 0> { + SDNode opnode, bit Commutable = 0> { // The register-immediate version is re-materializable. This is useful // in particular for taking the address of a local. let isReMaterializable = 1 in { @@ -1369,7 +1395,7 @@ multiclass AsI1_rbin_irs opcod, string opc, /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. let hasPostISelHook = 1, Defs = [CPSR] in { multiclass AsI1_bin_s_irs { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p), 4, iii, @@ -1402,7 +1428,7 @@ multiclass AsI1_bin_s_irs { def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p), 4, iii, @@ -1431,8 +1457,8 @@ multiclass AsI1_rbin_s_is opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, - PatFrag opnode, bit Commutable = 0, - string rrDecoderMethod = ""> { + SDPatternOperator opnode, bit Commutable = 0, + string rrDecoderMethod = ""> { def ri : AI1, @@ -1561,7 +1587,7 @@ class AI_exta_rrot_np opcod, string opc> /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. 
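/// Annotation: ARMadde/ARMsube are now passed directly as the 'opnode'
/// SDNode parameter (e.g. AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>),
/// replacing the removed three-operand BinOpWithFlagFrag wrapper; the
/// carry-flag operand is presumably matched through the node's own type
/// profile rather than through an explicit $FLAG fragment operand.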
let TwoOperandAliasConstraint = "$Rn = $Rd" in -multiclass AI1_adde_sube_irs opcod, string opc, PatFrag opnode, +multiclass AI1_adde_sube_irs opcod, string opc, SDNode opnode, bit Commutable = 0> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { def ri : AsI1 opcod, string opc, PatFrag opnode, /// AI1_rsc_irs - Define instructions and patterns for rsc let TwoOperandAliasConstraint = "$Rn = $Rd" in -multiclass AI1_rsc_irs opcod, string opc, PatFrag opnode> { +multiclass AI1_rsc_irs opcod, string opc, SDNode opnode> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { def ri : AsI1 imm; let Inst{27-8} = 0b00110010000011110000; let Inst{7-0} = imm; + let DecoderMethod = "DecodeHINTInstruction"; } def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>; @@ -1888,6 +1915,7 @@ def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>; def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>; +def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { @@ -1915,7 +1943,7 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, let Inst{7-4} = 0b0111; } // default immediate for breakpoint mnemonic -def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>; +def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>; def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> { @@ -2181,7 +2209,7 @@ let isCall = 1, // at least be a pseudo instruction expanding to the predicated version // at MC lowering time. Defs = [LR], Uses = [SP] in { - def BL : ABXI<0b1011, (outs), (ins bl_target:$func), + def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBrL]> { @@ -2191,7 +2219,7 @@ let isCall = 1, let DecoderMethod = "DecodeBranchImmInstruction"; } - def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func), + def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func), IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBrL]> { @@ -2232,7 +2260,7 @@ let isCall = 1, // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. - def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func), + def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBr]>; } @@ -2240,7 +2268,7 @@ let isCall = 1, let isBranch = 1, isTerminator = 1 in { // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. :( - def Bcc : ABI<0b1010, (outs), (ins br_target:$target), + def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target), IIC_Br, "b", "\t$target", [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>, Sched<[WriteBr]> { @@ -2255,8 +2283,9 @@ let isBranch = 1, isTerminator = 1 in { // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly // should be sufficient. // FIXME: Is B really a Barrier? That doesn't seem right. 
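// Note (annotation): in the expansion below, (ops 14, zero_reg) is the
// always-execute predicate operand pair: 14 (0b1110) is the AL condition
// code and zero_reg the unused CPSR-condition register operand, so B is
// simply an unconditional Bcc.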
- def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br, - [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>, + def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br, + [(br bb:$target)], (Bcc arm_br_target:$target, + (ops 14, zero_reg))>, Sched<[WriteBr]>; let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { @@ -2283,7 +2312,7 @@ let isBranch = 1, isTerminator = 1 in { } // BLX (immediate) -def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary, +def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary, "blx\t$target", []>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { let Inst{31-25} = 0b1111101; @@ -2313,9 +2342,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>, Sched<[WriteBr]>; - def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst), + def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst), 4, IIC_Br, [], - (Bcc br_target:$dst, (ops 14, zero_reg))>, + (Bcc arm_br_target:$dst, (ops 14, zero_reg))>, Requires<[IsARM]>, Sched<[WriteBr]>; def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), @@ -2467,14 +2496,12 @@ def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>, // Load -defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, - UnOpFrag<(load node:$Src)>>; +defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>; defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si, - UnOpFrag<(zextloadi8 node:$Src)>>; -defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, - BinOpFrag<(store node:$LHS, node:$RHS)>>; + zextloadi8>; +defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>; defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si, - BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; + truncstorei8>; // Special LDR for loads from non-pc-relative constpools. let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0, @@ -2764,6 +2791,12 @@ def LDRBT_POST : ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q), (outs GPR:$Rt)>; +// Pseudo instruction ldr Rt, =immediate +def LDRConstPool + : ARMAsmPseudo<"ldr${q} $Rt, $immediate", + (ins const_pool_asm_imm:$immediate, pred:$q), + (outs GPR:$Rt)>; + // Store // Stores with truncate @@ -3299,8 +3332,8 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm), } def : InstAlias<"mov${p} $Rd, $imm", - (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>, - Requires<[IsARM]>; + (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>, + Requires<[IsARM, HasV6T2]>; def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, @@ -3439,11 +3472,9 @@ def UBFX : I<(outs GPRnopc:$Rd), // defm ADD : AsI1_bin_irs<0b0100, "add", - IIC_iALUi, IIC_iALUr, IIC_iALUsr, - BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; + IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>; defm SUB : AsI1_bin_irs<0b0010, "sub", - IIC_iALUi, IIC_iALUr, IIC_iALUsr, - BinOpFrag<(sub node:$LHS, node:$RHS)>>; + IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>; // ADD and SUB with 's' bit set. // @@ -3455,27 +3486,21 @@ defm SUB : AsI1_bin_irs<0b0010, "sub", // FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen // support for an optional CPSR definition that corresponds to the DAG // node's second value. We can then eliminate the implicit def of CPSR. 
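// Background sketch (annotation, for illustration): BinOpFrag is a plain
// two-operand PatFrag wrapper, roughly
//
//   class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
//
// so once a multiclass parameter is loosened from PatFrag to
// SDPatternOperator, passing the bare node (e.g. 'add' instead of
// BinOpFrag<(add node:$LHS, node:$RHS)>) matches the same DAGs, which is
// what the simplifications below rely on.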
-defm ADDS : AsI1_bin_s_irs, 1>; -defm SUBS : AsI1_bin_s_irs>; +defm ADDS : AsI1_bin_s_irs; +defm SUBS : AsI1_bin_s_irs; -defm ADC : AI1_adde_sube_irs<0b0101, "adc", - BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>; -defm SBC : AI1_adde_sube_irs<0b0110, "sbc", - BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; +defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>; +defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>; defm RSB : AsI1_rbin_irs<0b0011, "rsb", IIC_iALUi, IIC_iALUr, IIC_iALUsr, - BinOpFrag<(sub node:$LHS, node:$RHS)>>; + sub>; // FIXME: Eliminate them if we can write def : Pat patterns which defines // CPSR and the implicit def of CPSR is not needed. -defm RSBS : AsI1_rbin_s_is>; +defm RSBS : AsI1_rbin_s_is; -defm RSC : AI1_rsc_irs<0b0111, "rsc", - BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; +defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>; // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. // The assume-no-carry-in form uses the negation of the input since add/sub @@ -3685,20 +3710,19 @@ def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), (USAT imm0_31:$pos, GPRnopc:$a, 0)>; +def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm), + (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; //===----------------------------------------------------------------------===// // Bitwise Instructions. // defm AND : AsI1_bin_irs<0b0000, "and", - IIC_iBITi, IIC_iBITr, IIC_iBITsr, - BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; + IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>; defm ORR : AsI1_bin_irs<0b1100, "orr", - IIC_iBITi, IIC_iBITr, IIC_iBITsr, - BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; + IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>; defm EOR : AsI1_bin_irs<0b0001, "eor", - IIC_iBITi, IIC_iBITr, IIC_iBITsr, - BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; + IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>; defm BIC : AsI1_bin_irs<0b1110, "bic", IIC_iBITi, IIC_iBITr, IIC_iBITsr, BinOpFrag<(and node:$LHS, (not node:$RHS))>>; @@ -3923,9 +3947,10 @@ def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]> { + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> { bits<4> RdLo; bits<4> RdHi; bits<4> Rm; @@ -3989,28 +4014,28 @@ def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, Requires<[IsARM, HasV6]>; -multiclass AI_smul { +multiclass AI_smul { def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16), (sext_inreg GPR:$Rm, i16)))]>, Requires<[IsARM, HasV5TE]>; def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16), (sra GPR:$Rm, (i32 16))))]>, Requires<[IsARM, HasV5TE]>; def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + 
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)), (sext_inreg GPR:$Rm, i16)))]>, Requires<[IsARM, HasV5TE]>; def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)), (sra GPR:$Rm, (i32 16))))]>, Requires<[IsARM, HasV5TE]>; @@ -4026,13 +4051,13 @@ multiclass AI_smul { } -multiclass AI_smla { +multiclass AI_smla { let DecoderMethod = "DecodeSMLAInstruction" in { def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", [(set GPRnopc:$Rd, (add GPR:$Ra, - (opnode (sext_inreg GPRnopc:$Rn, i16), + (mul (sext_inreg GPRnopc:$Rn, i16), (sext_inreg GPRnopc:$Rm, i16))))]>, Requires<[IsARM, HasV5TE, UseMulOps]>; @@ -4040,7 +4065,7 @@ multiclass AI_smla { (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", [(set GPRnopc:$Rd, - (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16), + (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16), (sra GPRnopc:$Rm, (i32 16)))))]>, Requires<[IsARM, HasV5TE, UseMulOps]>; @@ -4048,7 +4073,7 @@ multiclass AI_smla { (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", [(set GPRnopc:$Rd, - (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)), (sext_inreg GPRnopc:$Rm, i16))))]>, Requires<[IsARM, HasV5TE, UseMulOps]>; @@ -4056,7 +4081,7 @@ multiclass AI_smla { (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", [(set GPRnopc:$Rd, - (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)), (sra GPRnopc:$Rm, (i32 16)))))]>, Requires<[IsARM, HasV5TE, UseMulOps]>; @@ -4074,8 +4099,8 @@ multiclass AI_smla { } } -defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; -defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm SMUL : AI_smul<"smul">; +defm SMLA : AI_smla<"smla">; // Halfword multiply accumulate long: SMLAL. def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), @@ -4336,8 +4361,7 @@ def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan", // defm CMP : AI1_cmp_irs<0b1010, "cmp", - IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, - BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>; // ARMcmpZ can re-use the above instruction definitions. 
def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm), @@ -4745,7 +4769,7 @@ def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), class acquiring_load : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - return isAtLeastAcquire(Ordering); + return isAcquireOrStronger(Ordering); }]>; def atomic_load_acquire_8 : acquiring_load; @@ -4755,7 +4779,7 @@ def atomic_load_acquire_32 : acquiring_load; class releasing_store : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast(N)->getOrdering(); - return isAtLeastRelease(Ordering); + return isReleaseOrStronger(Ordering); }]>; def atomic_store_release_8 : releasing_store; @@ -4831,21 +4855,21 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, } class ACI + list pattern, IndexMode im = IndexModeNone> : I { + opc, asm, "", pattern> { let Inst{27-25} = 0b110; } class ACInoP + list pattern, IndexMode im = IndexModeNone> : InoP { + opc, asm, "", pattern> { let Inst{31-28} = 0b1111; let Inst{27-25} = 0b110; } -multiclass LdStCop { +multiclass LdStCop pattern> { def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), - asm, "\t$cop, $CRd, $addr"> { + asm, "\t$cop, $CRd, $addr", pattern> { bits<13> addr; bits<4> cop; bits<4> CRd; @@ -4861,7 +4885,7 @@ multiclass LdStCop { let DecoderMethod = "DecodeCopMemInstruction"; } def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), - asm, "\t$cop, $CRd, $addr!", IndexModePre> { + asm, "\t$cop, $CRd, $addr!", [], IndexModePre> { bits<13> addr; bits<4> cop; bits<4> CRd; @@ -4878,7 +4902,7 @@ multiclass LdStCop { } def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, postidx_imm8s4:$offset), - asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> { bits<9> offset; bits<4> addr; bits<4> cop; @@ -4897,7 +4921,7 @@ multiclass LdStCop { def _OPTION : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, coproc_option_imm:$option), - asm, "\t$cop, $CRd, $addr, $option"> { + asm, "\t$cop, $CRd, $addr, $option", []> { bits<8> option; bits<4> addr; bits<4> cop; @@ -4914,9 +4938,9 @@ multiclass LdStCop { let DecoderMethod = "DecodeCopMemInstruction"; } } -multiclass LdSt2Cop { +multiclass LdSt2Cop pattern> { def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), - asm, "\t$cop, $CRd, $addr"> { + asm, "\t$cop, $CRd, $addr", pattern> { bits<13> addr; bits<4> cop; bits<4> CRd; @@ -4932,7 +4956,7 @@ multiclass LdSt2Cop { let DecoderMethod = "DecodeCopMemInstruction"; } def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), - asm, "\t$cop, $CRd, $addr!", IndexModePre> { + asm, "\t$cop, $CRd, $addr!", [], IndexModePre> { bits<13> addr; bits<4> cop; bits<4> CRd; @@ -4949,7 +4973,7 @@ multiclass LdSt2Cop { } def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, postidx_imm8s4:$offset), - asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> { bits<9> offset; bits<4> addr; bits<4> cop; @@ -4968,7 +4992,7 @@ multiclass LdSt2Cop { def _OPTION : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, coproc_option_imm:$option), - asm, "\t$cop, $CRd, $addr, $option"> { + asm, "\t$cop, $CRd, $addr, $option", []> { bits<8> option; bits<4> addr; bits<4> cop; @@ -4986,14 +5010,15 @@ multiclass LdSt2Cop { } } -defm LDC : LdStCop <1, 0, "ldc">; -defm LDCL : LdStCop <1, 
1, "ldcl">; -defm STC : LdStCop <0, 0, "stc">; -defm STCL : LdStCop <0, 1, "stcl">; -defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>; -defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>; -defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>; -defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>; +defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>; +defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; +defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; + +defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; +defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; +defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>; //===----------------------------------------------------------------------===// // Move between coprocessor and ARM core register. @@ -5118,9 +5143,9 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; -class MovRRCopro2 pattern = []> - : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, - GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary, +class MovRRCopro2 pattern = []> + : ABXI<0b1100, oops, iops, NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>, Requires<[PreV8]> { let Inst{31-28} = 0b1111; @@ -5139,13 +5164,18 @@ class MovRRCopro2 pattern = []> let Inst{7-4} = opc1; let Inst{3-0} = CRm; - let DecoderMethod = "DecodeMRRC2"; + let DecoderMethod = "DecoderForMRRC2AndMCRR2"; } def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, c_imm:$CRm), [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, imm:$CRm)]>; -def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; + +def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */, + (outs GPRnopc:$Rt, GPRnopc:$Rt2), + (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; //===----------------------------------------------------------------------===// // Move between special register and ARM core register @@ -5164,7 +5194,7 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, let Unpredictable{11-0} = 0b110100001111; } -def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, +def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>, Requires<[IsARM]>; // The MRSsys instruction is the MRS instruction from the ARM ARM, @@ -5206,6 +5236,7 @@ def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked), // to distinguish between them. The mask operand contains the special register // (R Bit) in bit 4 and bits 3-0 contains the mask with the fields to be // accessed in the special register. 
+let Defs = [CPSR] in def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary, "msr", "\t$mask, $Rn", []> { bits<5> mask; @@ -5220,6 +5251,7 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary, let Inst{3-0} = Rn; } +let Defs = [CPSR] in def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary, "msr", "\t$mask, $imm", []> { bits<5> mask; @@ -5268,8 +5300,8 @@ let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; let usesCustomInserter = 1, Defs = [CPSR] in - def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary, - [(win__dbzchk GPR:$divisor)]>; + def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, + [(win__dbzchk tGPR:$divisor)]>; //===----------------------------------------------------------------------===// // TLS Instructions @@ -5423,6 +5455,8 @@ def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>, + Requires<[IsARM, UseMovt]>; def : ARMPat<(ARMWrapperJT tjumptable:$dst), (LEApcrelJT tjumptable:$dst)>; @@ -5568,9 +5602,9 @@ include "ARMInstrNEON.td" // // Memory barriers -def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>; -def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>; -def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>; +def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>; +def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>; +def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>; // System instructions def : MnemonicAlias<"swi", "svc">; @@ -5583,13 +5617,13 @@ def : MnemonicAlias<"stmfd", "stmdb">; def : MnemonicAlias<"stmia", "stm">; def : MnemonicAlias<"stmea", "stm">; -// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the -// shift amount is zero (i.e., unspecified). +// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the +// input operands swapped when the shift amount is zero (i.e., unspecified). def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm", - (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>, Requires<[IsARM, HasV6]>; def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm", - (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + (PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>, Requires<[IsARM, HasV6]>; // PUSH/POP aliases for STM/LDM @@ -5747,23 +5781,23 @@ def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>, // the instruction definitions need difference constraints pre-v6. // Use these aliases for the assembly parsing on pre-v6. 
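// Context (annotation): before ARMv6, multiply instructions such as MUL are
// UNPREDICTABLE when the destination overlaps certain source registers
// (e.g. "mul r0, r0, r1" on ARMv5), so the pre-v6 instruction definitions
// carry an extra register constraint; the zero-priority aliases below keep
// the usual spellings parseable there without affecting printing.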
def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm", - (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra", (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, - pred:$p, cc_out:$s)>, + pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm", - (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm", - (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm", - (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", - (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>, Requires<[IsARM, NoV6]>; // 'it' blocks in ARM mode just validate the predicates. The IT itself @@ -5775,3 +5809,36 @@ let mayLoad = 1, mayStore =1, hasSideEffects = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>; + +//===---------------------------------- +// Atomic cmpxchg for -O0 +//===---------------------------------- + +// The fast register allocator used during -O0 inserts spills to cover any VRegs +// live across basic block boundaries. When this happens between an LDXR and an +// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to +// fail. + +// Unfortunately, this means we have to have an alternative (expanded +// post-regalloc) path for -O0 compilations. Fortunately this path can be +// significantly more naive than the standard expansion: we conservatively +// assume seq_cst, strong cmpxchg and omit clrex on failure. + +let Constraints = "@earlyclobber $Rd,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in { +def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status), + (ins GPR:$addr, GPR:$desired, GPR:$new), + NoItinerary, []>, Sched<[]>; + +def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status), + (ins GPR:$addr, GPR:$desired, GPR:$new), + NoItinerary, []>, Sched<[]>; + +def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status), + (ins GPR:$addr, GPR:$desired, GPR:$new), + NoItinerary, []>, Sched<[]>; + +def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status), + (ins GPR:$addr, GPRPair:$desired, GPRPair:$new), + NoItinerary, []>, Sched<[]>; +} diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 5b1f9a06442e..93a174f3678a 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -15,10 +15,6 @@ // Thumb specific DAG Nodes. // -def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; - def imm_sr_XFORM: SDNodeXFormgetZExtValue(); return CurDAG->getTargetConstant((Imm == 32 ? 
0 : Imm), SDLoc(N), MVT::i32); @@ -70,6 +66,14 @@ def thumb_immshifted_shamt : SDNodeXFormgetTargetConstant(V, SDLoc(N), MVT::i32); }]>; +def imm256_510 : ImmLeaf= 256 && Imm < 511; +}]>; + +def thumb_imm256_510_addend : SDNodeXFormgetTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32); +}]>; + // Scaled 4 immediate. def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; } def t_imm0_1020s4 : Operand { @@ -121,26 +125,38 @@ def t_adrlabel : Operand { let ParserMatchClass = UnsignedOffset_b8s2; } -def t_bcctarget : Operand { - let EncoderMethod = "getThumbBCCTargetOpValue"; - let DecoderMethod = "DecodeThumbBCCTargetOperand"; -} -def t_cbtarget : Operand { - let EncoderMethod = "getThumbCBTargetOpValue"; - let DecoderMethod = "DecodeThumbCmpBROperand"; +def thumb_br_target : Operand { + let ParserMatchClass = ThumbBranchTarget; + let EncoderMethod = "getThumbBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; } -def t_bltarget : Operand { +def thumb_bl_target : Operand { + let ParserMatchClass = ThumbBranchTarget; let EncoderMethod = "getThumbBLTargetOpValue"; let DecoderMethod = "DecodeThumbBLTargetOperand"; } -def t_blxtarget : Operand { +// Target for BLX *from* thumb mode. +def thumb_blx_target : Operand { + let ParserMatchClass = ARMBranchTarget; let EncoderMethod = "getThumbBLXTargetOpValue"; let DecoderMethod = "DecodeThumbBLXOffset"; } +def thumb_bcc_target : Operand { + let ParserMatchClass = ThumbBranchTarget; + let EncoderMethod = "getThumbBCCTargetOpValue"; + let DecoderMethod = "DecodeThumbBCCTargetOperand"; +} + +def thumb_cb_target : Operand { + let ParserMatchClass = ThumbBranchTarget; + let EncoderMethod = "getThumbCBTargetOpValue"; + let DecoderMethod = "DecodeThumbCmpBROperand"; +} + // t_addrmode_pc :=