Diffstat (limited to 'lib')
-rw-r--r-- lib/Analysis/BasicAliasAnalysis.cpp | 9
-rw-r--r-- lib/Analysis/BranchProbabilityInfo.cpp | 11
-rw-r--r-- lib/Analysis/CallGraph.cpp | 34
-rw-r--r-- lib/Analysis/ConstantFolding.cpp | 80
-rw-r--r-- lib/Analysis/DemandedBits.cpp | 10
-rw-r--r-- lib/Analysis/InlineCost.cpp | 2
-rw-r--r-- lib/Analysis/InstructionSimplify.cpp | 167
-rw-r--r-- lib/Analysis/ModuleSummaryAnalysis.cpp | 3
-rw-r--r-- lib/Analysis/OptimizationDiagnosticInfo.cpp | 2
-rw-r--r-- lib/Analysis/ProfileSummaryInfo.cpp | 13
-rw-r--r-- lib/Analysis/ScalarEvolution.cpp | 159
-rw-r--r-- lib/Analysis/TargetLibraryInfo.cpp | 112
-rw-r--r-- lib/Analysis/TargetTransformInfo.cpp | 13
-rw-r--r-- lib/Analysis/ValueTracking.cpp | 276
-rw-r--r-- lib/Analysis/VectorUtils.cpp | 1
-rw-r--r-- lib/AsmParser/LLParser.cpp | 18
-rw-r--r-- lib/Bitcode/Reader/BitcodeReader.cpp | 17
-rw-r--r-- lib/Bitcode/Reader/MetadataLoader.cpp | 2
-rw-r--r-- lib/Bitcode/Writer/BitcodeWriter.cpp | 8
-rw-r--r-- lib/Bitcode/Writer/ValueEnumerator.cpp | 7
-rw-r--r-- lib/CMakeLists.txt | 2
-rw-r--r-- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 14
-rw-r--r-- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 42
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 22
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 89
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.h | 20
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfFile.h | 4
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 6
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.h | 5
-rw-r--r-- lib/CodeGen/AsmPrinter/WinException.cpp | 12
-rw-r--r-- lib/CodeGen/AtomicExpandPass.cpp | 31
-rw-r--r-- lib/CodeGen/CMakeLists.txt | 3
-rw-r--r-- lib/CodeGen/CodeGen.cpp | 4
-rw-r--r-- lib/CodeGen/CodeGenPrepare.cpp | 548
-rw-r--r-- lib/CodeGen/ExpandPostRAPseudos.cpp | 5
-rw-r--r-- lib/CodeGen/ExpandReductions.cpp | 167
-rw-r--r-- lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 10
-rw-r--r-- lib/CodeGen/GlobalISel/RegBankSelect.cpp | 9
-rw-r--r-- lib/CodeGen/GlobalISel/Utils.cpp | 8
-rw-r--r-- lib/CodeGen/IfConversion.cpp | 30
-rw-r--r-- lib/CodeGen/LiveRangeShrink.cpp | 211
-rw-r--r-- lib/CodeGen/LiveVariables.cpp | 2
-rw-r--r-- lib/CodeGen/MachineBlockPlacement.cpp | 30
-rw-r--r-- lib/CodeGen/MachineVerifier.cpp | 4
-rw-r--r-- lib/CodeGen/PHIElimination.cpp | 2
-rw-r--r-- lib/CodeGen/RegisterCoalescer.cpp | 2
-rw-r--r-- lib/CodeGen/RegisterScavenging.cpp | 7
-rw-r--r-- lib/CodeGen/SafeStack.cpp | 172
-rw-r--r-- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 660
-rw-r--r-- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176
-rw-r--r-- lib/CodeGen/SelectionDAG/FastISel.cpp | 20
-rw-r--r-- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 33
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 58
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 162
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 138
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 6
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 13
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 74
-rw-r--r-- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11
-rw-r--r-- lib/CodeGen/ShrinkWrap.cpp | 12
-rw-r--r-- lib/CodeGen/SjLjEHPrepare.cpp | 4
-rw-r--r-- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 6
-rw-r--r-- lib/CodeGen/TargetPassConfig.cpp | 11
-rw-r--r-- lib/CodeGen/TwoAddressInstructionPass.cpp | 7
-rw-r--r-- lib/CodeGen/UnreachableBlockElim.cpp | 7
-rw-r--r-- lib/DebugInfo/CodeView/CMakeLists.txt | 2
-rw-r--r-- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 41
-rw-r--r-- lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp | 10
-rw-r--r-- lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp | 91
-rw-r--r-- lib/DebugInfo/CodeView/TypeDatabase.cpp | 73
-rw-r--r-- lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 65
-rw-r--r-- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 9
-rw-r--r-- lib/DebugInfo/DWARF/DWARFContext.cpp | 25
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 5
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 4
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDie.cpp | 12
-rw-r--r-- lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 6
-rw-r--r-- lib/DebugInfo/DWARF/DWARFUnit.cpp | 16
-rw-r--r-- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 8
-rw-r--r-- lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 2
-rw-r--r-- lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 5
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp | 12
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 33
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 2
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 3
-rw-r--r-- lib/Fuzzer/FuzzerDriver.cpp | 3
-rw-r--r-- lib/Fuzzer/FuzzerFlags.def | 8
-rw-r--r-- lib/Fuzzer/FuzzerInternal.h | 1
-rw-r--r-- lib/Fuzzer/FuzzerLoop.cpp | 20
-rw-r--r-- lib/Fuzzer/FuzzerMutate.cpp | 5
-rw-r--r-- lib/Fuzzer/afl/afl_driver.cpp | 57
-rw-r--r-- lib/Fuzzer/test/AFLDriverTest.cpp | 8
-rw-r--r-- lib/Fuzzer/test/CMakeLists.txt | 1
-rw-r--r-- lib/Fuzzer/test/OverwriteInputTest.cpp | 13
-rw-r--r-- lib/Fuzzer/test/afl-driver.test | 26
-rw-r--r-- lib/Fuzzer/test/overwrite-input.test | 2
-rw-r--r-- lib/IR/AsmWriter.cpp | 7
-rw-r--r-- lib/IR/AttributeImpl.h | 18
-rw-r--r-- lib/IR/Attributes.cpp | 72
-rw-r--r-- lib/IR/ConstantFold.cpp | 10
-rw-r--r-- lib/IR/ConstantRange.cpp | 37
-rw-r--r-- lib/IR/Constants.cpp | 15
-rw-r--r-- lib/IR/ConstantsContext.h | 49
-rw-r--r-- lib/IR/DebugInfoMetadata.cpp | 18
-rw-r--r-- lib/IR/DebugLoc.cpp | 114
-rw-r--r-- lib/IR/DiagnosticInfo.cpp | 25
-rw-r--r-- lib/IR/Function.cpp | 79
-rw-r--r-- lib/IR/Globals.cpp | 43
-rw-r--r-- lib/IR/IRBuilder.cpp | 88
-rw-r--r-- lib/IR/Instruction.cpp | 24
-rw-r--r-- lib/IR/Instructions.cpp | 119
-rw-r--r-- lib/IR/LegacyPassManager.cpp | 13
-rw-r--r-- lib/IR/Module.cpp | 35
-rw-r--r-- lib/IR/Type.cpp | 71
-rw-r--r-- lib/IR/Verifier.cpp | 13
-rw-r--r-- lib/LLVMBuild.txt | 2
-rw-r--r-- lib/LTO/LTO.cpp | 4
-rw-r--r-- lib/LTO/LTOCodeGenerator.cpp | 18
-rw-r--r-- lib/LTO/ThinLTOCodeGenerator.cpp | 3
-rw-r--r-- lib/Linker/IRMover.cpp | 18
-rw-r--r-- lib/MC/MCObjectStreamer.cpp | 5
-rw-r--r-- lib/MC/MCParser/AsmParser.cpp | 21
-rw-r--r-- lib/Object/COFFObjectFile.cpp | 4
-rw-r--r-- lib/Object/WasmObjectFile.cpp | 41
-rw-r--r-- lib/ObjectYAML/WasmYAML.cpp | 8
-rw-r--r-- lib/ProfileData/SampleProfWriter.cpp | 42
-rw-r--r-- lib/Support/APInt.cpp | 312
-rw-r--r-- lib/Support/CMakeLists.txt | 1
-rw-r--r-- lib/Support/Parallel.cpp | 138
-rw-r--r-- lib/Support/Unix/Path.inc | 30
-rw-r--r-- lib/Support/Unix/Process.inc | 4
-rw-r--r-- lib/Target/AArch64/AArch64.td | 1
-rw-r--r-- lib/Target/AArch64/AArch64CallLowering.cpp | 2
-rw-r--r-- lib/Target/AArch64/AArch64FastISel.cpp | 2
-rw-r--r-- lib/Target/AArch64/AArch64ISelLowering.cpp | 6
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.cpp | 30
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.td | 8
-rw-r--r-- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 27
-rw-r--r-- lib/Target/AArch64/AArch64SchedFalkorDetails.td | 78
-rw-r--r-- lib/Target/AArch64/AArch64SchedFalkorWriteRes.td | 37
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.cpp | 8
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.h | 10
-rw-r--r-- lib/Target/AArch64/AArch64TargetObjectFile.cpp | 8
-rw-r--r-- lib/Target/AArch64/AArch64TargetObjectFile.h | 3
-rw-r--r-- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 106
-rw-r--r-- lib/Target/AArch64/AArch64TargetTransformInfo.h | 11
-rw-r--r-- lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 19
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 10
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.h | 4
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.td | 21
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.h | 2
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2881
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.h | 15
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 34
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3
-rw-r--r-- lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r-- lib/Target/AMDGPU/FLATInstructions.td | 34
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.cpp | 153
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.h | 61
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 282
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.h | 24
-rw-r--r-- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.cpp | 310
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.h | 33
-rw-r--r-- lib/Target/AMDGPU/SIInstructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/VOP2Instructions.td | 51
-rw-r--r-- lib/Target/AMDGPU/VOP3Instructions.td | 3
-rw-r--r-- lib/Target/ARM/ARMBaseInstrInfo.h | 18
-rw-r--r-- lib/Target/ARM/ARMCallLowering.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMFastISel.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMISelLowering.cpp | 27
-rw-r--r-- lib/Target/ARM/ARMISelLowering.h | 8
-rw-r--r-- lib/Target/ARM/ARMInstrInfo.td | 7
-rw-r--r-- lib/Target/ARM/ARMInstrThumb.td | 4
-rw-r--r-- lib/Target/ARM/ARMInstructionSelector.cpp | 39
-rw-r--r-- lib/Target/ARM/ARMLegalizerInfo.cpp | 8
-rw-r--r-- lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 4
-rw-r--r-- lib/Target/ARM/ARMRegisterBankInfo.cpp | 1
-rw-r--r-- lib/Target/ARM/ARMTargetMachine.cpp | 2
-rw-r--r-- lib/Target/AVR/AVRFrameLowering.cpp | 2
-rw-r--r-- lib/Target/AVR/AVRISelLowering.cpp | 10
-rw-r--r-- lib/Target/AVR/AVRInstrInfo.td | 6
-rw-r--r-- lib/Target/AVR/AVRRegisterInfo.cpp | 1
-rw-r--r-- lib/Target/BPF/BPFISelLowering.cpp | 3
-rw-r--r-- lib/Target/BPF/BPFInstrInfo.td | 9
-rw-r--r-- lib/Target/Hexagon/HexagonISelLowering.cpp | 3
-rw-r--r-- lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 2
-rw-r--r-- lib/Target/Hexagon/HexagonPatterns.td | 7
-rw-r--r-- lib/Target/Hexagon/HexagonPseudo.td | 2
-rw-r--r-- lib/Target/Lanai/LanaiISelLowering.cpp | 31
-rw-r--r-- lib/Target/Lanai/LanaiISelLowering.h | 5
-rw-r--r-- lib/Target/Lanai/LanaiInstrInfo.td | 12
-rw-r--r-- lib/Target/MSP430/MSP430FrameLowering.cpp | 7
-rw-r--r-- lib/Target/MSP430/MSP430ISelLowering.cpp | 271
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.h | 6
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.td | 9
-rw-r--r-- lib/Target/MSP430/MSP430RegisterInfo.cpp | 4
-rw-r--r-- lib/Target/Mips/MipsFastISel.cpp | 2
-rw-r--r-- lib/Target/Mips/MipsISelLowering.cpp | 2
-rw-r--r-- lib/Target/Mips/MipsInstrInfo.td | 6
-rw-r--r-- lib/Target/Mips/MipsOptimizePICCall.cpp | 2
-rw-r--r-- lib/Target/NVPTX/NVPTXISelLowering.cpp | 18
-rw-r--r-- lib/Target/NVPTX/NVPTXInstrInfo.td | 9
-rw-r--r-- lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp | 17
-rw-r--r-- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 3
-rw-r--r-- lib/Target/PowerPC/PPCFastISel.cpp | 3
-rw-r--r-- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 255
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.cpp | 98
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.h | 29
-rw-r--r-- lib/Target/PowerPC/PPCInstr64Bit.td | 28
-rw-r--r-- lib/Target/PowerPC/PPCInstrAltivec.td | 40
-rw-r--r-- lib/Target/PowerPC/PPCInstrInfo.td | 13
-rw-r--r-- lib/Target/PowerPC/PPCInstrVSX.td | 2
-rw-r--r-- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 3
-rw-r--r-- lib/Target/Sparc/SparcISelLowering.cpp | 31
-rw-r--r-- lib/Target/Sparc/SparcInstrInfo.td | 9
-rw-r--r-- lib/Target/Sparc/SparcRegisterInfo.td | 6
-rw-r--r-- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 4
-rw-r--r-- lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp | 19
-rw-r--r-- lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 14
-rw-r--r-- lib/Target/SystemZ/README.txt | 2
-rw-r--r-- lib/Target/SystemZ/SystemZFeatures.td | 14
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.cpp | 13
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.h | 2
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFP.td | 13
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFormats.td | 301
-rw-r--r-- lib/Target/SystemZ/SystemZInstrInfo.td | 201
-rw-r--r-- lib/Target/SystemZ/SystemZOperands.td | 2
-rw-r--r-- lib/Target/SystemZ/SystemZOperators.td | 3
-rw-r--r-- lib/Target/SystemZ/SystemZSchedule.td | 4
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZ13.td | 84
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZ196.td | 92
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZEC12.td | 92
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.cpp | 7
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.h | 10
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrCall.td | 4
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3
-rw-r--r-- lib/Target/X86/X86.td | 3
-rw-r--r-- lib/Target/X86/X86FastISel.cpp | 48
-rw-r--r-- lib/Target/X86/X86FixupLEAs.cpp | 269
-rw-r--r-- lib/Target/X86/X86ISelDAGToDAG.cpp | 41
-rw-r--r-- lib/Target/X86/X86ISelLowering.cpp | 214
-rw-r--r-- lib/Target/X86/X86InstrCompiler.td | 14
-rw-r--r-- lib/Target/X86/X86InstrInfo.cpp | 52
-rw-r--r-- lib/Target/X86/X86InstrInfo.h | 11
-rw-r--r-- lib/Target/X86/X86InstrInfo.td | 35
-rw-r--r-- lib/Target/X86/X86InstrSSE.td | 18
-rw-r--r-- lib/Target/X86/X86InstructionSelector.cpp | 214
-rw-r--r-- lib/Target/X86/X86IntrinsicsInfo.h | 2
-rw-r--r-- lib/Target/X86/X86LegalizerInfo.cpp | 16
-rw-r--r-- lib/Target/X86/X86RegisterInfo.cpp | 14
-rw-r--r-- lib/Target/X86/X86Subtarget.h | 6
-rw-r--r-- lib/Target/X86/X86TargetMachine.cpp | 4
-rw-r--r-- lib/Target/X86/X86TargetTransformInfo.cpp | 194
-rw-r--r-- lib/Target/X86/X86WinEHState.cpp | 2
-rw-r--r-- lib/Target/XCore/XCoreISelLowering.cpp | 5
-rw-r--r-- lib/Target/XCore/XCoreInstrInfo.td | 11
-rw-r--r-- lib/ToolDrivers/CMakeLists.txt | 1
-rw-r--r-- lib/ToolDrivers/LLVMBuild.txt | 24
-rw-r--r-- lib/ToolDrivers/llvm-lib/CMakeLists.txt (renamed from lib/LibDriver/CMakeLists.txt) | 0
-rw-r--r-- lib/ToolDrivers/llvm-lib/LLVMBuild.txt (renamed from lib/LibDriver/LLVMBuild.txt) | 0
-rw-r--r-- lib/ToolDrivers/llvm-lib/LibDriver.cpp (renamed from lib/LibDriver/LibDriver.cpp) | 2
-rw-r--r-- lib/ToolDrivers/llvm-lib/Options.td (renamed from lib/LibDriver/Options.td) | 0
-rw-r--r-- lib/Transforms/Coroutines/CoroFrame.cpp | 100
-rw-r--r-- lib/Transforms/IPO/FunctionImport.cpp | 15
-rw-r--r-- lib/Transforms/IPO/Inliner.cpp | 4
-rw-r--r-- lib/Transforms/IPO/PartialInlining.cpp | 426
-rw-r--r-- lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 199
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 109
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCalls.cpp | 8
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCasts.cpp | 25
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCompares.cpp | 39
-rw-r--r-- lib/Transforms/InstCombine/InstCombineInternal.h | 30
-rw-r--r-- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 6
-rw-r--r-- lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 20
-rw-r--r-- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 2
-rw-r--r-- lib/Transforms/InstCombine/InstructionCombining.cpp | 26
-rw-r--r-- lib/Transforms/Instrumentation/AddressSanitizer.cpp | 34
-rw-r--r-- lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 8
-rw-r--r-- lib/Transforms/Instrumentation/EfficiencySanitizer.cpp | 49
-rw-r--r-- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 7
-rw-r--r-- lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 10
-rw-r--r-- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 295
-rw-r--r-- lib/Transforms/Scalar/NewGVN.cpp | 210
-rw-r--r-- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 161
-rw-r--r-- lib/Transforms/Scalar/SpeculativeExecution.cpp | 43
-rw-r--r-- lib/Transforms/Utils/BypassSlowDivision.cpp | 4
-rw-r--r-- lib/Transforms/Utils/CloneFunction.cpp | 32
-rw-r--r-- lib/Transforms/Utils/CloneModule.cpp | 2
-rw-r--r-- lib/Transforms/Utils/EscapeEnumerator.cpp | 3
-rw-r--r-- lib/Transforms/Utils/InlineFunction.cpp | 61
-rw-r--r-- lib/Transforms/Utils/InstructionNamer.cpp | 13
-rw-r--r-- lib/Transforms/Utils/Local.cpp | 106
-rw-r--r-- lib/Transforms/Utils/LoopUtils.cpp | 201
-rw-r--r-- lib/Transforms/Utils/ModuleUtils.cpp | 12
-rw-r--r-- lib/Transforms/Utils/SimplifyLibCalls.cpp | 6
-rw-r--r-- lib/Transforms/Utils/VNCoercion.cpp | 9
-rw-r--r-- lib/Transforms/Utils/ValueMapper.cpp | 9
-rw-r--r-- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2
-rw-r--r-- lib/Transforms/Vectorize/LoopVectorize.cpp | 241
-rw-r--r-- lib/Transforms/Vectorize/SLPVectorizer.cpp | 112
-rw-r--r-- lib/XRay/Trace.cpp | 34
319 files changed, 12342 insertions(+), 3756 deletions(-)
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 537823020301..a33c01a0e461 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -17,13 +17,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,6 +36,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include <algorithm>
#define DEBUG_TYPE "basicaa"
@@ -1283,9 +1284,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
- bool SignKnownZero, SignKnownOne;
- ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
- 0, &AC, nullptr, DT);
+ KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT);
+ bool SignKnownZero = Known.isNonNegative();
+ bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
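
The ComputeSignBit-to-KnownBits rewrite above is the pattern this commit repeats across most of the analysis files. A minimal sketch of the new idiom (the wrapper function and its name are illustrative; the KnownBits API calls are the ones used in the hunk):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// One KnownBits query replaces the old two-output ComputeSignBit call.
static void signBits(const Value *V, const DataLayout &DL, AssumptionCache &AC,
                     const Instruction *CxtI, const DominatorTree *DT,
                     bool &SignKnownZero, bool &SignKnownOne) {
  KnownBits Known = computeKnownBits(V, DL, /*Depth=*/0, &AC, CxtI, DT);
  SignKnownZero = Known.isNonNegative(); // sign bit proven zero
  SignKnownOne = Known.isNegative();     // sign bit proven one
}
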
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 0dc4475ca0e2..db87b17c1567 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -301,6 +301,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
WeightSum += Weights[i];
}
}
+ assert(WeightSum <= UINT32_MAX &&
+ "Expected weights to scale down to 32 bits");
if (WeightSum == 0 || ReachableIdxs.size() == 0) {
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
@@ -328,21 +330,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// the difference between reachable blocks.
if (ToDistribute > BranchProbability::getZero()) {
BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
- for (auto i : ReachableIdxs) {
+ for (auto i : ReachableIdxs)
BP[i] += PerEdge;
- ToDistribute -= PerEdge;
- }
- // Tail goes to the first reachable edge.
- BP[ReachableIdxs[0]] += ToDistribute;
}
}
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
setEdgeProbability(BB, i, BP[i]);
- assert(WeightSum <= UINT32_MAX &&
- "Expected weights to scale down to 32 bits");
-
return true;
}
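
The deleted loop subtracted PerEdge from ToDistribute on each iteration and then dumped the division remainder onto the first reachable edge; the new code distributes evenly and deliberately drops the sub-ulp remainder rather than biasing one edge. A hedged standalone sketch of the surviving logic (helper name and parameters are illustrative, not LLVM API):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Spread the probability mass of unreachable successors evenly over the
// reachable ones. Integer division may leave a tiny remainder, which is
// intentionally left undistributed.
static void spreadEvenly(BranchProbability ToDistribute,
                         ArrayRef<unsigned> ReachableIdxs,
                         MutableArrayRef<BranchProbability> BP) {
  if (ToDistribute > BranchProbability::getZero()) {
    BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
    for (unsigned i : ReachableIdxs)
      BP[i] += PerEdge;
  }
}
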
diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp
index 6942176ae6ae..ff5242f69a1b 100644
--- a/lib/Analysis/CallGraph.cpp
+++ b/lib/Analysis/CallGraph.cpp
@@ -21,23 +21,18 @@ using namespace llvm;
//
CallGraph::CallGraph(Module &M)
- : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)),
+ : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)),
CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) {
// Add every function to the call graph.
for (Function &F : M)
addToCallGraph(&F);
-
- // If we didn't find a main function, use the external call graph node
- if (!Root)
- Root = ExternalCallingNode;
}
CallGraph::CallGraph(CallGraph &&Arg)
- : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root),
+ : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)),
ExternalCallingNode(Arg.ExternalCallingNode),
CallsExternalNode(std::move(Arg.CallsExternalNode)) {
Arg.FunctionMap.clear();
- Arg.Root = nullptr;
Arg.ExternalCallingNode = nullptr;
}
@@ -57,21 +52,9 @@ CallGraph::~CallGraph() {
void CallGraph::addToCallGraph(Function *F) {
CallGraphNode *Node = getOrInsertFunction(F);
- // If this function has external linkage, anything could call it.
- if (!F->hasLocalLinkage()) {
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // Found the entry point?
- if (F->getName() == "main") {
- if (Root) // Found multiple external mains? Don't pick one.
- Root = ExternalCallingNode;
- else
- Root = Node; // Found a main, keep track of it!
- }
- }
-
- // If this function has its address taken, anything could call it.
- if (F->hasAddressTaken())
+ // If this function has external linkage or has its address taken, anything
+ // could call it.
+ if (!F->hasLocalLinkage() || F->hasAddressTaken())
ExternalCallingNode->addCalledFunction(CallSite(), Node);
// If this function is not defined in this translation unit, it could call
@@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) {
}
void CallGraph::print(raw_ostream &OS) const {
- OS << "CallGraph Root is: ";
- if (Function *F = Root->getFunction())
- OS << F->getName() << "\n";
- else {
- OS << "<<null function: 0x" << Root << ">>\n";
- }
-
// Print in a deterministic order by sorting CallGraphNodes by name. We do
// this here to avoid slowing down the non-printing fast path.
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 130e917e49d7..0ca712bbfe70 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1438,6 +1438,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
Name == "sinf" || Name == "sinhf" || Name == "sqrtf";
case 't':
return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf";
+ case '_':
+
+ // Check for various function names that get used for the math functions
+ // when the header files are preprocessed with the macro
+ // __FINITE_MATH_ONLY__ enabled.
+ // The '12' here is the length of the shortest name that can match.
+ // We need to check the size before looking at Name[1] and Name[2]
+ // so we may as well check a limit that will eliminate mismatches.
+ if (Name.size() < 12 || Name[1] != '_')
+ return false;
+ switch (Name[2]) {
+ default:
+ return false;
+ case 'a':
+ return Name == "__acos_finite" || Name == "__acosf_finite" ||
+ Name == "__asin_finite" || Name == "__asinf_finite" ||
+ Name == "__atan2_finite" || Name == "__atan2f_finite";
+ case 'c':
+ return Name == "__cosh_finite" || Name == "__coshf_finite";
+ case 'e':
+ return Name == "__exp_finite" || Name == "__expf_finite" ||
+ Name == "__exp2_finite" || Name == "__exp2f_finite";
+ case 'l':
+ return Name == "__log_finite" || Name == "__logf_finite" ||
+ Name == "__log10_finite" || Name == "__log10f_finite";
+ case 'p':
+ return Name == "__pow_finite" || Name == "__powf_finite";
+ case 's':
+ return Name == "__sinh_finite" || Name == "__sinhf_finite";
+ }
}
}
@@ -1637,13 +1667,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
- switch (Name[0]) {
+ char NameKeyChar = Name[0];
+ if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_')
+ NameKeyChar = Name[2];
+
+ switch (NameKeyChar) {
case 'a':
if ((Name == "acos" && TLI->has(LibFunc_acos)) ||
- (Name == "acosf" && TLI->has(LibFunc_acosf)))
+ (Name == "acosf" && TLI->has(LibFunc_acosf)) ||
+ (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) ||
+ (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite)))
return ConstantFoldFP(acos, V, Ty);
else if ((Name == "asin" && TLI->has(LibFunc_asin)) ||
- (Name == "asinf" && TLI->has(LibFunc_asinf)))
+ (Name == "asinf" && TLI->has(LibFunc_asinf)) ||
+ (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) ||
+ (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite)))
return ConstantFoldFP(asin, V, Ty);
else if ((Name == "atan" && TLI->has(LibFunc_atan)) ||
(Name == "atanf" && TLI->has(LibFunc_atanf)))
@@ -1657,15 +1695,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "cosf" && TLI->has(LibFunc_cosf)))
return ConstantFoldFP(cos, V, Ty);
else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) ||
- (Name == "coshf" && TLI->has(LibFunc_coshf)))
+ (Name == "coshf" && TLI->has(LibFunc_coshf)) ||
+ (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) ||
+ (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite)))
return ConstantFoldFP(cosh, V, Ty);
break;
case 'e':
if ((Name == "exp" && TLI->has(LibFunc_exp)) ||
- (Name == "expf" && TLI->has(LibFunc_expf)))
+ (Name == "expf" && TLI->has(LibFunc_expf)) ||
+ (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) ||
+ (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite)))
return ConstantFoldFP(exp, V, Ty);
if ((Name == "exp2" && TLI->has(LibFunc_exp2)) ||
- (Name == "exp2f" && TLI->has(LibFunc_exp2f)))
+ (Name == "exp2f" && TLI->has(LibFunc_exp2f)) ||
+ (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) ||
+ (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite)))
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
@@ -1680,10 +1724,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
break;
case 'l':
if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) ||
- (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)))
+ (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) ||
+ (Name == "__log_finite" && V > 0 &&
+ TLI->has(LibFunc_log_finite)) ||
+ (Name == "__logf_finite" && V > 0 &&
+ TLI->has(LibFunc_logf_finite)))
return ConstantFoldFP(log, V, Ty);
else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) ||
- (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)))
+ (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) ||
+ (Name == "__log10_finite" && V > 0 &&
+ TLI->has(LibFunc_log10_finite)) ||
+ (Name == "__log10f_finite" && V > 0 &&
+ TLI->has(LibFunc_log10f_finite)))
return ConstantFoldFP(log10, V, Ty);
break;
case 'r':
@@ -1695,7 +1747,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "sinf" && TLI->has(LibFunc_sinf)))
return ConstantFoldFP(sin, V, Ty);
else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) ||
- (Name == "sinhf" && TLI->has(LibFunc_sinhf)))
+ (Name == "sinhf" && TLI->has(LibFunc_sinhf)) ||
+ (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) ||
+ (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite)))
return ConstantFoldFP(sinh, V, Ty);
else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) ||
(Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf)))
@@ -1813,13 +1867,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
- (Name == "powf" && TLI->has(LibFunc_powf)))
+ (Name == "powf" && TLI->has(LibFunc_powf)) ||
+ (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) ||
+ (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite)))
return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
if ((Name == "fmod" && TLI->has(LibFunc_fmod)) ||
(Name == "fmodf" && TLI->has(LibFunc_fmodf)))
return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
if ((Name == "atan2" && TLI->has(LibFunc_atan2)) ||
- (Name == "atan2f" && TLI->has(LibFunc_atan2f)))
+ (Name == "atan2f" && TLI->has(LibFunc_atan2f)) ||
+ (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) ||
+ (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
} else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 9f5dc5318239..8f808f3e7871 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -86,13 +86,11 @@ void DemandedBits::determineLiveOperandBits(
[&](unsigned BitWidth, const Value *V1, const Value *V2) {
const DataLayout &DL = I->getModule()->getDataLayout();
Known = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V1), Known, DL, 0,
- &AC, UserI, &DT);
+ computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT);
if (V2) {
Known2 = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V2), Known2, DL,
- 0, &AC, UserI, &DT);
+ computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT);
}
};
@@ -118,7 +116,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getHighBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countLeadingZeros()+1));
+ std::min(BitWidth, Known.countMaxLeadingZeros()+1));
}
break;
case Intrinsic::cttz:
@@ -128,7 +126,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getLowBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countTrailingZeros()+1));
+ std::min(BitWidth, Known.countMaxTrailingZeros()+1));
}
break;
}
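
countMaxLeadingZeros() and countMaxTrailingZeros() are thin wrappers over the Known.One mask: the value cannot have more leading (or trailing) zeros than the position of its first bit known to be one. A small sketch with an assumed 8-bit value:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static void demo() {
  llvm::KnownBits K(8);
  K.One = llvm::APInt(8, 0x10);               // bit 4 is known to be one
  unsigned MaxLZ = K.countMaxLeadingZeros();  // 3: only bits 7..5 can be zero
  unsigned MaxTZ = K.countMaxTrailingZeros(); // 4: only bits 3..0 can be zero
  (void)MaxLZ; (void)MaxTZ;
}
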
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 100a591e452c..44c14cb17c22 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -63,7 +63,7 @@ static cl::opt<bool>
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
- "inlinecold-threshold", cl::Hidden, cl::init(225),
+ "inlinecold-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining functions with cold attribute"));
static cl::opt<int>
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 4a713f441ce8..5728887cc1e9 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1317,7 +1317,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
// If all valid bits in the shift amount are known zero, the first operand is
// unchanged.
unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth);
- if (Known.Zero.countTrailingOnes() >= NumValidShiftBits)
+ if (Known.countMinTrailingZeros() >= NumValidShiftBits)
return Op0;
return nullptr;
@@ -1536,7 +1536,7 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0);
auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1);
- // For and-of-comapares, check if the intersection is empty:
+ // For and-of-compares, check if the intersection is empty:
// (icmp X, C0) && (icmp X, C1) --> empty set --> false
if (IsAnd && Range0.intersectWith(Range1).isEmptySet())
return getFalse(Cmp0->getType());
@@ -1870,6 +1870,24 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (B ^ ~A)
+ // (B & A) | (B ^ ~A) -> (B ^ ~A)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op1;
+
+ // (~A ^ B) | (A & B) -> (~A ^ B)
+ // (~A ^ B) | (B & A) -> (~A ^ B)
+ // (B ^ ~A) | (A & B) -> (B ^ ~A)
+ // (B ^ ~A) | (B & A) -> (B ^ ~A)
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op0;
+
if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false))
return V;
@@ -2286,7 +2304,6 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
Type *ITy = GetCompareTy(LHS); // The return type.
- bool LHSKnownNonNegative, LHSKnownNegative;
switch (Pred) {
default:
llvm_unreachable("Unknown ICmp predicate!");
@@ -2304,39 +2321,41 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ case ICmpInst::ICMP_SLT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SLE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SGT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
}
+ }
return nullptr;
}
@@ -2535,6 +2554,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
+/// TODO: A large part of this logic is duplicated in InstCombine's
+/// foldICmpBinOp(). We should be able to share that and avoid the code
+/// duplication.
static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
@@ -2616,15 +2638,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (RHSKnownNonNegative && YKnownNegative)
+ KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (RHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
- if (RHSKnownNegative || YKnownNonNegative)
+ if (RHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2636,15 +2654,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNonNegative && YKnownNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
- if (LHSKnownNegative || YKnownNonNegative)
+ if (LHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2691,28 +2705,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred (urem X, Y), Y
if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getFalse(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2722,28 +2735,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred X, (urem Y, X)
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2815,10 +2827,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
break;
case Instruction::UDiv:
case Instruction::LShr:
- if (ICmpInst::isSigned(Pred))
+ if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact())
break;
- LLVM_FALLTHROUGH;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::SDiv:
+ if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact())
+ break;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::AShr:
if (!LBO->isExact() || !RBO->isExact())
break;
@@ -4034,24 +4055,21 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
/// match a root vector source operand that contains that element in the same
/// vector lane (ie, the same mask index), so we can eliminate the shuffle(s).
static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
- Constant *Mask, Value *RootVec, int RootElt,
+ int MaskVal, Value *RootVec,
unsigned MaxRecurse) {
if (!MaxRecurse--)
return nullptr;
// Bail out if any mask value is undefined. That kind of shuffle may be
// simplified further based on demanded bits or other folds.
- int MaskVal = ShuffleVectorInst::getMaskValue(Mask, RootElt);
if (MaskVal == -1)
return nullptr;
// The mask value chooses which source operand we need to look at next.
- Value *SourceOp;
int InVecNumElts = Op0->getType()->getVectorNumElements();
- if (MaskVal < InVecNumElts) {
- RootElt = MaskVal;
- SourceOp = Op0;
- } else {
+ int RootElt = MaskVal;
+ Value *SourceOp = Op0;
+ if (MaskVal >= InVecNumElts) {
RootElt = MaskVal - InVecNumElts;
SourceOp = Op1;
}
@@ -4061,7 +4079,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) {
return foldIdentityShuffles(
DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1),
- SourceShuf->getMask(), RootVec, RootElt, MaxRecurse);
+ SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse);
}
// TODO: Look through bitcasts? What if the bitcast changes the vector element
@@ -4126,17 +4144,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
// second one.
if (Op0Const && !Op1Const) {
std::swap(Op0, Op1);
- for (int &Idx : Indices) {
- if (Idx == -1)
- continue;
- Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts;
- assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 &&
- "shufflevector mask index out of range");
- }
- Mask = ConstantDataVector::get(
- Mask->getContext(),
- makeArrayRef(reinterpret_cast<uint32_t *>(Indices.data()),
- MaskNumElts));
+ ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
}
// A shuffle of a splat is always the splat itself. Legal if the shuffle's
@@ -4160,7 +4168,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
for (unsigned i = 0; i != MaskNumElts; ++i) {
// Note that recursion is limited for each vector element, so if any element
// exceeds the limit, this will fail to simplify.
- RootVec = foldIdentityShuffles(i, Op0, Op1, Mask, RootVec, i, MaxRecurse);
+ RootVec =
+ foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse);
// We can't replace a widening/narrowing shuffle with one of its operands.
if (!RootVec || RootVec->getType() != RetTy)
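
The new or-of-and/xor folds can be checked exhaustively over booleans: ~A ^ B is the XNOR of A and B, and A & B is only 1 when A == B == 1, where the XNOR is already 1, so the OR is absorbed. A standalone verification sketch:

#include <cassert>

int main() {
  // 1-bit exhaustive check of (A & B) | (~A ^ B) == (~A ^ B); since the
  // identity holds per bit, it holds bitwise for any width.
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B) {
      unsigned NotA = A ^ 1; // 1-bit complement of A
      assert(((A & B) | (NotA ^ B)) == (NotA ^ B));
    }
  return 0;
}
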
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 99f900ae3932..26706f5509ba 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -232,7 +232,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI);
+ auto ScaledCount = PSI->getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
@@ -330,6 +330,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
const Module &M,
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
ProfileSummaryInfo *PSI) {
+ assert(PSI);
ModuleSummaryIndex Index;
// Identify the local values in the llvm.used and llvm.compiler.used sets,
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
index 73245981b022..e38e530c052d 100644
--- a/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -101,7 +101,7 @@ void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
// These are read-only for now.
DiagnosticLocation DL = OptDiag->getLocation();
StringRef FN =
- GlobalValue::getRealLinkageName(OptDiag->getFunction().getName());
+ GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName());
StringRef PassName(OptDiag->PassName);
io.mapRequired("Pass", PassName);
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 1a53a8ed4283..502f4205b689 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -75,11 +75,14 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
return None;
assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
"We can only get profile count for call/invoke instruction.");
- // Check if there is a profile metadata on the instruction. If it is present,
- // determine hotness solely based on that.
- uint64_t TotalCount;
- if (Inst->extractProfTotalWeight(TotalCount))
- return TotalCount;
+ if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) {
+ // In sample PGO mode, check if there is a profile metadata on the
+ // instruction. If it is present, determine hotness solely based on that,
+ // since the sampled entry count may not be accurate.
+ uint64_t TotalCount;
+ if (Inst->extractProfTotalWeight(TotalCount))
+ return TotalCount;
+ }
if (BFI)
return BFI->getBlockProfileCount(Inst->getParent());
return None;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 01dca0793145..800354d2f5b4 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -584,7 +584,7 @@ CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
static int CompareSCEVComplexity(
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> &EqCacheSCEV,
const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS,
- unsigned Depth = 0) {
+ DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
if (LHS == RHS)
return 0;
@@ -629,9 +629,16 @@ static int CompareSCEVComplexity(
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
- // Compare addrec loop depths.
+ // If there is a dominance relationship between the loops, sort by the
+ // dominance. Otherwise, sort by depth. We require such order in getAddExpr.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
+ const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
+ assert(LHead != RHead && "Two loops share the same header?");
+ if (DT.dominates(LHead, RHead))
+ return 1;
+ else if (DT.dominates(RHead, LHead))
+ return -1;
unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth();
if (LDepth != RDepth)
return (int)LDepth - (int)RDepth;
@@ -645,7 +652,7 @@ static int CompareSCEVComplexity(
// Lexicographically compare.
for (unsigned i = 0; i != LNumOps; ++i) {
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i),
- RA->getOperand(i), Depth + 1);
+ RA->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -669,7 +676,7 @@ static int CompareSCEVComplexity(
if (i >= RNumOps)
return 1;
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i),
- RC->getOperand(i), Depth + 1);
+ RC->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -683,10 +690,10 @@ static int CompareSCEVComplexity(
// Lexicographically compare udiv expressions.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(),
- Depth + 1);
+ DT, Depth + 1);
if (X != 0)
return X;
- X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(),
+ X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT,
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
@@ -701,7 +708,7 @@ static int CompareSCEVComplexity(
// Compare cast expressions by operand.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(),
- RC->getOperand(), Depth + 1);
+ RC->getOperand(), DT, Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
@@ -724,7 +731,7 @@ static int CompareSCEVComplexity(
/// land in memory.
///
static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
- LoopInfo *LI) {
+ LoopInfo *LI, DominatorTree &DT) {
if (Ops.size() < 2) return; // Noop
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> EqCache;
@@ -732,15 +739,16 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
// This is the common case, which also happens to be trivially simple.
// Special case it.
const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
- if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0)
+ if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0)
std::swap(LHS, RHS);
return;
}
// Do the rough sort by complexity.
std::stable_sort(Ops.begin(), Ops.end(),
- [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) {
- return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0;
+ [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) {
+ return
+ CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0;
});
// Now that we are sorted by complexity, group elements of the same
@@ -2186,7 +2194,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
@@ -2492,7 +2500,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// added together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
+ ++OtherIdx) {
+ // We expect the AddRecExprs to be sorted in reverse dominance order,
+ // so that the 1st found AddRecExpr is dominated by all others.
+ assert(DT.dominates(
+ cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()->getHeader(),
+ AddRec->getLoop()->getHeader()) &&
+ "AddRecExprs are not sorted in reverse dominance order?");
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
// Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L>
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
@@ -2518,6 +2532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
+ }
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
@@ -2614,7 +2629,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
@@ -3211,7 +3226,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -3312,7 +3327,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -4636,7 +4651,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
KnownBits Known(BitWidth);
computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC,
nullptr, &DT);
- return Known.Zero.countTrailingOnes();
+ return Known.countMinTrailingZeros();
}
// SCEVUDivExpr
@@ -5955,6 +5970,30 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
return false;
}
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
+ : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
+ : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
+ assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
+ !isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
+ "Exact is not allowed to be less precise than Max");
+ for (auto *PredSet : PredSetList)
+ for (auto *P : *PredSet)
+ addPredicate(P);
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
+ : ExitLimit(E, M, MaxOrZero, {&PredSet}) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
+ bool MaxOrZero)
+ : ExitLimit(E, M, MaxOrZero, None) {}
+
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
@@ -6637,13 +6676,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
// {K,ashr,<positive-constant>} stabilizes to signum(K) in at most
// bitwidth(K) iterations.
Value *FirstValue = PN->getIncomingValueForBlock(Predecessor);
- bool KnownZero, KnownOne;
- ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr,
- Predecessor->getTerminator(), &DT);
+ KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr,
+ Predecessor->getTerminator(), &DT);
auto *Ty = cast<IntegerType>(RHS->getType());
- if (KnownZero)
+ if (Known.isNonNegative())
StableValue = ConstantInt::get(Ty, 0);
- else if (KnownOne)
+ else if (Known.isNegative())
StableValue = ConstantInt::get(Ty, -1, true);
else
return getCouldNotCompute();
@@ -7377,48 +7415,49 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
const APInt &N = NC->getAPInt();
APInt Two(BitWidth, 2);
- {
- using namespace APIntOps;
- const APInt& C = L;
- // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The B coefficient is M-N/2
- APInt B(M);
- B -= N.sdiv(Two);
+ // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The A coefficient is N/2
- APInt A(N.sdiv(Two));
+ // The A coefficient is N/2
+ APInt A = N.sdiv(Two);
- // Compute the B^2-4ac term.
- APInt SqrtTerm(B);
- SqrtTerm *= B;
- SqrtTerm -= 4 * (A * C);
+ // The B coefficient is M-N/2
+ APInt B = M;
+ B -= A; // A is the same as N/2.
- if (SqrtTerm.isNegative()) {
- // The loop is provably infinite.
- return None;
- }
+ // The C coefficient is L.
+ const APInt& C = L;
- // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
- // integer value or else APInt::sqrt() will assert.
- APInt SqrtVal(SqrtTerm.sqrt());
+ // Compute the B^2-4ac term.
+ APInt SqrtTerm = B;
+ SqrtTerm *= B;
+ SqrtTerm -= 4 * (A * C);
- // Compute the two solutions for the quadratic formula.
- // The divisions must be performed as signed divisions.
- APInt NegB(-B);
- APInt TwoA(A << 1);
- if (TwoA.isMinValue())
- return None;
+ if (SqrtTerm.isNegative()) {
+ // The loop is provably infinite.
+ return None;
+ }
+
+ // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
+ // integer value or else APInt::sqrt() will assert.
+ APInt SqrtVal = SqrtTerm.sqrt();
+
+ // Compute the two solutions for the quadratic formula.
+ // The divisions must be performed as signed divisions.
+ APInt NegB = -std::move(B);
+ APInt TwoA = std::move(A);
+ TwoA <<= 1;
+ if (TwoA.isNullValue())
+ return None;
- LLVMContext &Context = SE.getContext();
+ LLVMContext &Context = SE.getContext();
- ConstantInt *Solution1 =
- ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
- ConstantInt *Solution2 =
- ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution1 =
+ ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution2 =
+ ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
- return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
- cast<SCEVConstant>(SE.getConstant(Solution2)));
- } // end APIntOps namespace
+ return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
+ cast<SCEVConstant>(SE.getConstant(Solution2)));
}
ScalarEvolution::ExitLimit
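
For reference, the polynomial this block solves: a chrec {L,+,M,+,N} evaluates at iteration x to L + Mx + Nx(x-1)/2, which rearranges to the quadratic whose coefficients appear in the comments above:

\[
\frac{N}{2}x^2 + \Bigl(M - \frac{N}{2}\Bigr)x + L \;=\; Ax^2 + Bx + C \;=\; 0,
\qquad
x \;=\; \frac{-B \pm \sqrt{B^2 - 4AC}}{2A}.
\]

The code bails out when the discriminant \(B^2 - 4AC\) is negative (the loop is provably infinite) and when \(2A = 0\) (the equation degenerates to linear).
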
@@ -8976,7 +9015,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).slt(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS);
}
APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
@@ -8985,7 +9024,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).ult(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS);
}
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
@@ -9002,7 +9041,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).sgt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS);
}
APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
@@ -9011,7 +9050,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).ugt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
}
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 848e1b4717b5..3cf1bbc5daa5 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -241,6 +241,50 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_tanhf);
}
+ // These definitions are due to the math-finite.h header on Linux.
+ TLI.setUnavailable(LibFunc_acos_finite);
+ TLI.setUnavailable(LibFunc_acosf_finite);
+ TLI.setUnavailable(LibFunc_acosl_finite);
+ TLI.setUnavailable(LibFunc_acosh_finite);
+ TLI.setUnavailable(LibFunc_acoshf_finite);
+ TLI.setUnavailable(LibFunc_acoshl_finite);
+ TLI.setUnavailable(LibFunc_asin_finite);
+ TLI.setUnavailable(LibFunc_asinf_finite);
+ TLI.setUnavailable(LibFunc_asinl_finite);
+ TLI.setUnavailable(LibFunc_atan2_finite);
+ TLI.setUnavailable(LibFunc_atan2f_finite);
+ TLI.setUnavailable(LibFunc_atan2l_finite);
+ TLI.setUnavailable(LibFunc_atanh_finite);
+ TLI.setUnavailable(LibFunc_atanhf_finite);
+ TLI.setUnavailable(LibFunc_atanhl_finite);
+ TLI.setUnavailable(LibFunc_cosh_finite);
+ TLI.setUnavailable(LibFunc_coshf_finite);
+ TLI.setUnavailable(LibFunc_coshl_finite);
+ TLI.setUnavailable(LibFunc_exp10_finite);
+ TLI.setUnavailable(LibFunc_exp10f_finite);
+ TLI.setUnavailable(LibFunc_exp10l_finite);
+ TLI.setUnavailable(LibFunc_exp2_finite);
+ TLI.setUnavailable(LibFunc_exp2f_finite);
+ TLI.setUnavailable(LibFunc_exp2l_finite);
+ TLI.setUnavailable(LibFunc_exp_finite);
+ TLI.setUnavailable(LibFunc_expf_finite);
+ TLI.setUnavailable(LibFunc_expl_finite);
+ TLI.setUnavailable(LibFunc_log10_finite);
+ TLI.setUnavailable(LibFunc_log10f_finite);
+ TLI.setUnavailable(LibFunc_log10l_finite);
+ TLI.setUnavailable(LibFunc_log2_finite);
+ TLI.setUnavailable(LibFunc_log2f_finite);
+ TLI.setUnavailable(LibFunc_log2l_finite);
+ TLI.setUnavailable(LibFunc_log_finite);
+ TLI.setUnavailable(LibFunc_logf_finite);
+ TLI.setUnavailable(LibFunc_logl_finite);
+ TLI.setUnavailable(LibFunc_pow_finite);
+ TLI.setUnavailable(LibFunc_powf_finite);
+ TLI.setUnavailable(LibFunc_powl_finite);
+ TLI.setUnavailable(LibFunc_sinh_finite);
+ TLI.setUnavailable(LibFunc_sinhf_finite);
+ TLI.setUnavailable(LibFunc_sinhl_finite);
+
// Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
TLI.setUnavailable(LibFunc_access);
@@ -496,7 +540,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
// Check for \01 prefix that is used to mangle __asm declarations and
// strip it if present.
- return GlobalValue::getRealLinkageName(funcName);
+ return GlobalValue::dropLLVMManglingEscape(funcName);
}
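
The renamed helper makes the intent explicit: a leading \01 byte tells LLVM not to mangle the symbol, and library-function lookup wants the raw name. A minimal sketch of its contract (the asserted names are illustrative):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/GlobalValue.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      // The \01 escape suppresses LLVM's mangling; strip it before lookup.
      assert(GlobalValue::dropLLVMManglingEscape("\01_fwrite") == "_fwrite");
      // Names without the escape pass through unchanged.
      assert(GlobalValue::dropLLVMManglingEscape("fwrite") == "fwrite");
    }
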
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
@@ -1004,22 +1048,34 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy());
case LibFunc_acos:
+ case LibFunc_acos_finite:
case LibFunc_acosf:
+ case LibFunc_acosf_finite:
case LibFunc_acosh:
+ case LibFunc_acosh_finite:
case LibFunc_acoshf:
+ case LibFunc_acoshf_finite:
case LibFunc_acoshl:
+ case LibFunc_acoshl_finite:
case LibFunc_acosl:
+ case LibFunc_acosl_finite:
case LibFunc_asin:
+ case LibFunc_asin_finite:
case LibFunc_asinf:
+ case LibFunc_asinf_finite:
case LibFunc_asinh:
case LibFunc_asinhf:
case LibFunc_asinhl:
case LibFunc_asinl:
+ case LibFunc_asinl_finite:
case LibFunc_atan:
case LibFunc_atanf:
case LibFunc_atanh:
+ case LibFunc_atanh_finite:
case LibFunc_atanhf:
+ case LibFunc_atanhf_finite:
case LibFunc_atanhl:
+ case LibFunc_atanhl_finite:
case LibFunc_atanl:
case LibFunc_cbrt:
case LibFunc_cbrtf:
@@ -1030,18 +1086,30 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosh:
+ case LibFunc_cosh_finite:
case LibFunc_coshf:
+ case LibFunc_coshf_finite:
case LibFunc_coshl:
+ case LibFunc_coshl_finite:
case LibFunc_cosl:
case LibFunc_exp10:
+ case LibFunc_exp10_finite:
case LibFunc_exp10f:
+ case LibFunc_exp10f_finite:
case LibFunc_exp10l:
+ case LibFunc_exp10l_finite:
case LibFunc_exp2:
+ case LibFunc_exp2_finite:
case LibFunc_exp2f:
+ case LibFunc_exp2f_finite:
case LibFunc_exp2l:
+ case LibFunc_exp2l_finite:
case LibFunc_exp:
+ case LibFunc_exp_finite:
case LibFunc_expf:
+ case LibFunc_expf_finite:
case LibFunc_expl:
+ case LibFunc_expl_finite:
case LibFunc_expm1:
case LibFunc_expm1f:
case LibFunc_expm1l:
@@ -1052,20 +1120,29 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_floorf:
case LibFunc_floorl:
case LibFunc_log10:
+ case LibFunc_log10_finite:
case LibFunc_log10f:
+ case LibFunc_log10f_finite:
case LibFunc_log10l:
+ case LibFunc_log10l_finite:
case LibFunc_log1p:
case LibFunc_log1pf:
case LibFunc_log1pl:
case LibFunc_log2:
+ case LibFunc_log2_finite:
case LibFunc_log2f:
+ case LibFunc_log2f_finite:
case LibFunc_log2l:
+ case LibFunc_log2l_finite:
case LibFunc_log:
+ case LibFunc_log_finite:
case LibFunc_logb:
case LibFunc_logbf:
case LibFunc_logbl:
case LibFunc_logf:
+ case LibFunc_logf_finite:
case LibFunc_logl:
+ case LibFunc_logl_finite:
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
@@ -1078,8 +1155,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinh:
+ case LibFunc_sinh_finite:
case LibFunc_sinhf:
+ case LibFunc_sinhf_finite:
case LibFunc_sinhl:
+ case LibFunc_sinhl_finite:
case LibFunc_sinl:
case LibFunc_sqrt:
case LibFunc_sqrt_finite:
@@ -1100,8 +1180,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getReturnType() == FTy.getParamType(0));
case LibFunc_atan2:
+ case LibFunc_atan2_finite:
case LibFunc_atan2f:
+ case LibFunc_atan2f_finite:
case LibFunc_atan2l:
+ case LibFunc_atan2l_finite:
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
@@ -1115,8 +1198,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_copysignf:
case LibFunc_copysignl:
case LibFunc_pow:
+ case LibFunc_pow_finite:
case LibFunc_powf:
+ case LibFunc_powf_finite:
case LibFunc_powl:
+ case LibFunc_powl_finite:
return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getReturnType() == FTy.getParamType(1));
@@ -1294,6 +1380,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"powf", "__svml_powf8", 8},
{"powf", "__svml_powf16", 16},
+ { "__pow_finite", "__svml_pow2", 2 },
+ { "__pow_finite", "__svml_pow4", 4 },
+ { "__pow_finite", "__svml_pow8", 8 },
+
+ { "__powf_finite", "__svml_powf4", 4 },
+ { "__powf_finite", "__svml_powf8", 8 },
+ { "__powf_finite", "__svml_powf16", 16 },
+
{"llvm.pow.f64", "__svml_pow2", 2},
{"llvm.pow.f64", "__svml_pow4", 4},
{"llvm.pow.f64", "__svml_pow8", 8},
@@ -1310,6 +1404,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"expf", "__svml_expf8", 8},
{"expf", "__svml_expf16", 16},
+ { "__exp_finite", "__svml_exp2", 2 },
+ { "__exp_finite", "__svml_exp4", 4 },
+ { "__exp_finite", "__svml_exp8", 8 },
+
+ { "__expf_finite", "__svml_expf4", 4 },
+ { "__expf_finite", "__svml_expf8", 8 },
+ { "__expf_finite", "__svml_expf16", 16 },
+
{"llvm.exp.f64", "__svml_exp2", 2},
{"llvm.exp.f64", "__svml_exp4", 4},
{"llvm.exp.f64", "__svml_exp8", 8},
@@ -1326,6 +1428,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"logf", "__svml_logf8", 8},
{"logf", "__svml_logf16", 16},
+ { "__log_finite", "__svml_log2", 2 },
+ { "__log_finite", "__svml_log4", 4 },
+ { "__log_finite", "__svml_log8", 8 },
+
+ { "__logf_finite", "__svml_logf4", 4 },
+ { "__logf_finite", "__svml_logf8", 8 },
+ { "__logf_finite", "__svml_logf16", 16 },
+
{"llvm.log.f64", "__svml_log2", 2},
{"llvm.log.f64", "__svml_log4", 4},
{"llvm.log.f64", "__svml_log8", 8},
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 26d606cce9bb..8a5d10473662 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+ return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
return TTIImpl->shouldConsiderAddressTypePromotion(
@@ -500,6 +504,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
+ Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
+}
+
+bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
+ return TTIImpl->shouldExpandReduction(II);
+}
+
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
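
The two new hooks give targets control over the experimental reduction intrinsics: useReductionIntrinsic asks whether the vectorizer should emit them at all, and shouldExpandReduction asks whether the generic expansion pass should lower one back to shuffles. A hypothetical target implementation (names and policy illustrative, not from the patch):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Sketch: prefer reduction intrinsics for vector integer adds and keep
    // them intact so the backend can lower them directly.
    struct MyTargetTTIImpl {
      bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                 TargetTransformInfo::ReductionFlags Flags) const {
        return Opcode == Instruction::Add && Ty->isVectorTy();
      }
      bool shouldExpandReduction(const IntrinsicInst *II) const {
        return false; // never fall back to the shuffle-based expansion
      }
    };
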
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index a7f3ff672aef..cba7363a0afa 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -88,9 +88,8 @@ struct Query {
/// classic case of this is assume(x = y), which will attempt to determine
/// bits in x from bits in y, which will attempt to determine bits in y from
/// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
- /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
- /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so
- /// on.
+ /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
+ /// (both of which can call computeKnownBits), and so on.
std::array<const Value *, MaxDepth> Excluded;
unsigned NumExcluded;
@@ -143,6 +142,16 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known,
Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
+static KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Query &Q);
+
+KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
+}
+
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
@@ -159,16 +168,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}
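
The new value-returning overload removes the declare-then-populate pattern at call sites, as the conversions below show. Typical use (a sketch; assumes the header supplies the usual default arguments for AC/CxtI/DT):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // One expression instead of constructing a KnownBits and passing it
    // by reference.
    bool knownNonNegative(const Value *V, const DataLayout &DL) {
      KnownBits Known = computeKnownBits(V, DL, /*Depth=*/0);
      return Known.isNonNegative();
    }
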
-static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q);
-
-void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth,
- AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
- ::ComputeSignBit(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
-}
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q);
@@ -194,9 +193,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return NonNegative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNonNegative();
}
bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
@@ -214,9 +212,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return Negative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNegative();
}
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);
@@ -342,10 +339,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
TrailZ = std::min(TrailZ, BitWidth);
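
The renamed KnownBits helpers state the min/max direction that the old Zero/One idioms left implicit: countMinTrailingZeros() is the old Zero.countTrailingOnes(), countMinLeadingOnes() the old One.countLeadingOnes(), and so on. A small worked instance (values illustrative):

    #include "llvm/Support/KnownBits.h"
    #include <cassert>
    using llvm::KnownBits;

    int main() {
      KnownBits Known(8);
      Known.Zero = 0x07; // bits 0..2 known zero
      Known.One  = 0x80; // bit 7 known one
      // At least three trailing zeros: Zero.countTrailingOnes() == 3.
      assert(Known.countMinTrailingZeros() == 3);
      // At least one leading one: One.countLeadingOnes() == 1.
      assert(Known.countMinLeadingOnes() == 1);
      // Bit 7 is known one, so no leading zero is possible at all.
      assert(Known.countMaxLeadingZeros() == 0);
    }
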
@@ -750,8 +747,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
- // assume(v <_u c)
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
+ // assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
@@ -761,9 +758,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1);
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
else
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
}
}
@@ -916,7 +913,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
m_Value(Y))))) {
Known2.resetAll();
computeKnownBits(Y, Known2, Depth + 1, Q);
- if (Known2.One.countTrailingOnes() > 0)
+ if (Known2.countMinTrailingOnes() > 0)
Known.Zero.setBit(0);
}
break;
@@ -953,14 +950,13 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
Known2.resetAll();
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -983,8 +979,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNegative() && Known2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
- MaxHighOnes = std::max(Known.One.countLeadingOnes(),
- Known2.One.countLeadingOnes());
+ MaxHighOnes =
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
// If either side is non-negative, the result is non-negative.
else if (Known.isNonNegative() || Known2.isNonNegative())
MaxHighZeros = 1;
@@ -993,8 +989,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNonNegative() && Known2.isNonNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
- MaxHighZeros = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ MaxHighZeros = std::max(Known.countMinLeadingZeros(),
+ Known2.countMinLeadingZeros());
// If either side is negative, the result is negative.
else if (Known.isNegative() || Known2.isNegative())
MaxHighOnes = 1;
@@ -1002,12 +998,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
- std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes());
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
} else if (SPF == SPF_UMIN) {
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros =
- std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes());
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
}
// Only known if known in both the LHS and RHS.
@@ -1185,8 +1181,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ unsigned Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -1207,7 +1203,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// to determine if we can prove known low zero bits.
KnownBits LocalKnown(BitWidth);
computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
- unsigned TrailZ = LocalKnown.Zero.countTrailingOnes();
+ unsigned TrailZ = LocalKnown.countMinTrailingZeros();
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1241,7 +1237,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
- LocalKnown.Zero.countTrailingOnes()));
+ LocalKnown.countMinTrailingZeros()));
}
}
@@ -1286,8 +1282,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
KnownBits Known3(Known);
computeKnownBits(L, Known3, Depth + 1, Q);
- Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(),
- Known3.Zero.countTrailingOnes()));
+ Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
+ Known3.countMinTrailingZeros()));
if (DontImproveNonNegativePhiBits)
break;
@@ -1386,12 +1382,25 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
Known.Zero |= Known2.Zero.byteSwap();
Known.One |= Known2.One.byteSwap();
break;
- case Intrinsic::ctlz:
+ case Intrinsic::ctlz: {
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ // If this call is undefined for 0, the result will be less than 2^n.
+ if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
+ PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleLZ)+1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case Intrinsic::cttz: {
- unsigned LowBits = Log2_32(BitWidth)+1;
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.One.countTrailingZeros();
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
- LowBits -= 1;
+ PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleTZ)+1;
Known.Zero.setBitsFrom(LowBits);
break;
}
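
The tightened bound keys off where a set bit is already known rather than off the raw bit width: a known one at bit 5 caps cttz at 5, so only Log2_32(5)+1 = 3 low result bits can vary. A worked sketch of the cttz case (illustrative values):

    #include "llvm/Support/KnownBits.h"
    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      unsigned BitWidth = 32;
      KnownBits Op(BitWidth);
      Op.One = 1u << 5;                 // bit 5 known one => cttz(x) <= 5
      unsigned PossibleTZ = Op.One.countTrailingZeros(); // 5
      unsigned LowBits = Log2_32(PossibleTZ) + 1;        // 3
      KnownBits Result(BitWidth);
      Result.Zero.setBitsFrom(LowBits); // bits 3..31 of the result are zero
      assert(PossibleTZ == 5 && LowBits == 3);
    }
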
@@ -1399,7 +1408,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
- unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation();
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
unsigned LowBits = Log2_32(BitsPossiblySet)+1;
Known.Zero.setBitsFrom(LowBits);
// TODO: we could bound KnownOne using the lower bound on the number
@@ -1450,6 +1459,14 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
/// Determine which bits of V are known to be either zero or one and return
+/// them.
+KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
+ KnownBits Known(getBitWidth(V->getType(), Q.DL));
+ computeKnownBits(V, Known, Depth, Q);
+ return Known;
+}
+
+/// Determine which bits of V are known to be either zero or one and return
/// them in the Known bit set.
///
/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
@@ -1568,16 +1585,6 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
-/// Determine whether the sign bit is known to be zero or one.
-/// Convenience wrapper around computeKnownBits.
-void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q) {
- KnownBits Bits(getBitWidth(V->getType(), Q.DL));
- computeKnownBits(V, Bits, Depth, Q);
- KnownOne = Bits.isNegative();
- KnownZero = Bits.isNonNegative();
-}
-
/// Return true if the given value is known to have exactly one
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
@@ -1842,24 +1849,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
- bool XKnownNonNegative, XKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- if (XKnownNegative)
+ KnownBits Known = computeKnownBits(X, Depth, Q);
+ if (Known.isNegative())
return true;
// If the shifter operand is a constant, and all of the bits shifted
// out are known to be zero, and X is known non-zero then at least one
// non-zero bit must remain.
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
- KnownBits Known(BitWidth);
- computeKnownBits(X, Known, Depth, Q);
-
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
- if (Known.One.countLeadingZeros() < BitWidth - ShiftVal)
+ if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
- if (Known.Zero.countTrailingOnes() >= ShiftVal)
+ if (Known.countMinTrailingZeros() >= ShiftVal)
return isKnownNonZero(X, Depth, Q);
}
}
@@ -1869,39 +1872,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
- bool XKnownNonNegative, XKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q);
+ KnownBits XKnown = computeKnownBits(X, Depth, Q);
+ KnownBits YKnown = computeKnownBits(Y, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
- if (XKnownNonNegative && YKnownNonNegative)
+ if (XKnown.isNonNegative() && YKnown.isNonNegative())
if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
// zero unless both X and Y equal INT_MIN.
- if (XKnownNegative && YKnownNegative) {
- KnownBits Known(BitWidth);
+ if (XKnown.isNegative() && YKnown.isNegative()) {
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
- computeKnownBits(X, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (XKnown.One.intersects(Mask))
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
- computeKnownBits(Y, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (YKnown.One.intersects(Mask))
return true;
}
// The sum of a non-negative number and a power of two is not zero.
- if (XKnownNonNegative &&
+ if (XKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
- if (YKnownNonNegative &&
+ if (YKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
@@ -2276,14 +2274,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
- if (Known.isNonNegative())
- return std::max(FirstAnswer, Known.Zero.countLeadingOnes());
-
- if (Known.isNegative())
- return std::max(FirstAnswer, Known.One.countLeadingOnes());
-
- // computeKnownBits gave us no extra information about the top bits.
- return FirstAnswer;
+ return std::max(FirstAnswer, Known.countMinSignBits());
}
/// This function computes the integer multiple of Base that equals V.
@@ -3441,8 +3432,8 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
// Note that underestimating the number of zero bits gives a more
// conservative answer.
- unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() +
- RHSKnown.Zero.countLeadingOnes();
+ unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
+ RHSKnown.countMinLeadingZeros();
// First handle the easy case: if we have enough zero bits there's
// definitely no overflow.
if (ZeroBits >= BitWidth)
@@ -3475,21 +3466,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- if (LHSKnownNonNegative || LHSKnownNegative) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
-
- if (LHSKnownNegative && RHSKnownNegative) {
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+ if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
// The sign bit is set in both cases: this MUST overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::AlwaysOverflows;
}
- if (LHSKnownNonNegative && RHSKnownNonNegative) {
+ if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
// The sign bit is clear in both cases: this CANNOT overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::NeverOverflows;
@@ -3499,6 +3486,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
+/// \brief Return true if we can prove that adding two values with the given
+/// KnownBits cannot overflow.
+/// Otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+ const KnownBits &RHSKnown) {
+ // Addition of two 2's complement numbers having opposite signs will never
+ // overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+ return true;
+
+ // If either of the values is known to be non-negative, adding them can only
+ // overflow if the second is also non-negative, so we can assume that.
+ // Two non-negative numbers will only overflow if there is a carry into the
+ // sign bit, so we can check that, even when the values are as big as
+ // possible, there is no carry into the sign bit.
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+ APInt MaxLHS = ~LHSKnown.Zero;
+ MaxLHS.clearSignBit();
+ APInt MaxRHS = ~RHSKnown.Zero;
+ MaxRHS.clearSignBit();
+ APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+ return Result.isSignBitClear();
+ }
+
+ // If either of the values is known to be negative, adding them can only
+ // overflow if the second is also negative, so we can assume that.
+ // Two negative numbers will only overflow if there is no carry into the
+ // sign bit, so we can check that, even when the values are as small as
+ // possible, there is still a carry into the sign bit.
+ if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+ APInt MinLHS = LHSKnown.One;
+ MinLHS.clearSignBit();
+ APInt MinRHS = RHSKnown.One;
+ MinRHS.clearSignBit();
+ APInt Result = std::move(MinLHS) + std::move(MinRHS);
+ return Result.isSignBitSet();
+ }
+
+ // If we reached here it means that we know nothing about the sign bits.
+ // In this case we can't know if there will be an overflow, since by
+ // changing the sign bits any two values can be made to overflow.
+ return false;
+}
+
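
The non-negative arm of checkRippleForSignedAdd can be exercised by hand: with both operands known non-negative and their top payload bits known zero, even the largest compatible values cannot carry into the sign bit. An 8-bit instance (standalone sketch, not patch code):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      KnownBits LHS(8), RHS(8);
      // Both known non-negative with bits 5..7 known zero: values <= 31.
      LHS.Zero = 0xE0;
      RHS.Zero = 0xE0;
      APInt MaxLHS = ~LHS.Zero; // 0x1F, the largest value LHS can hold
      MaxLHS.clearSignBit();
      APInt MaxRHS = ~RHS.Zero;
      MaxRHS.clearSignBit();
      // 31 + 31 = 62: no carry into bit 7, so the add cannot overflow.
      assert((MaxLHS + MaxRHS).isSignBitClear());
    }
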
static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const AddOperator *Add,
@@ -3510,18 +3542,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
return OverflowResult::NeverOverflows;
}
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
+ // If LHS and RHS each have at least two sign bits, the addition will look
+ // like
+ //
+ // XX..... +
+ // YY.....
+ //
+ // If the carry into the most significant position is 0, X and Y can't both
+ // be 1 and therefore the carry out of the addition is also 0.
+ //
+ // If the carry into the most significant position is 1, X and Y can't both
+ // be 0 and therefore the carry out of the addition is also 1.
+ //
+ // Since the carry into the most significant position is always equal to
+ // the carry out of the addition, there is no signed overflow.
+ if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
+ return OverflowResult::NeverOverflows;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
- if ((LHSKnownNonNegative && RHSKnownNegative) ||
- (LHSKnownNegative && RHSKnownNonNegative)) {
- // The sign bits are opposite: this CANNOT overflow.
+ if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
return OverflowResult::NeverOverflows;
- }
// The remaining code needs Add to be available. Early returns if not so.
if (!Add)
@@ -3532,14 +3575,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
// @llvm.assume'ed non-negative rather than proved so from analyzing its
// operands.
bool LHSOrRHSKnownNonNegative =
- (LHSKnownNonNegative || RHSKnownNonNegative);
- bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
+ bool LHSOrRHSKnownNegative =
+ (LHSKnown.isNegative() || RHSKnown.isNegative());
if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
- bool AddKnownNonNegative, AddKnownNegative;
- ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
- /*Depth=*/0, AC, CxtI, DT);
- if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
- (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
+ (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
return OverflowResult::NeverOverflows;
}
}
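
The new early exit is the classic observation that two values with two or more sign bits each behave like narrower integers whose sum always fits. In 8-bit terms, two sign bits confine a value to [-64, 63], and even 63 + 63 = 126 is representable. A quick check (standalone sketch):

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using llvm::APInt;

    int main() {
      // Two sign bits in i8 confine the value to [-64, 63].
      APInt A(8, 63), B(8, 63);
      bool Overflow = false;
      APInt Sum = A.sadd_ov(B, Overflow); // 126, still fits in i8
      assert(!Overflow && Sum == 126);
    }
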
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 722f17a8067e..2d2249da4e13 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
using namespace llvm;
using namespace llvm::PatternMatch;
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 97a567565b47..d7602c83435c 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -162,6 +162,10 @@ bool LLParser::ValidateEndOfModule() {
AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
AttributeSet::get(Context, FnAttrs));
II->setAttributes(AS);
+ } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+ AttrBuilder Attrs(GV->getAttributes());
+ Attrs.merge(B);
+ GV->setAttributes(AttributeSet::get(Context,Attrs));
} else {
llvm_unreachable("invalid object with forward attribute group reference");
}
@@ -832,10 +836,10 @@ bool LLParser::parseIndirectSymbol(
/// ParseGlobal
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
/// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
///
/// Everything up to and including OptionalUnnamedAddr has been parsed
/// already.
@@ -950,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
}
}
+ AttrBuilder Attrs;
+ LocTy BuiltinLoc;
+ std::vector<unsigned> FwdRefAttrGrps;
+ if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc))
+ return true;
+ if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) {
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
+ ForwardRefAttrGroups[GV] = FwdRefAttrGrps;
+ }
+
return false;
}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 580261a3b5e0..76298121566a 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -93,13 +93,6 @@ static cl::opt<bool> PrintSummaryGUIDs(
cl::desc(
"Print the global id for each value when reading the module summary"));
-// FIXME: This flag should either be removed or moved to clang as a driver flag.
-static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
- "ignore-empty-index-file", llvm::cl::ZeroOrMore,
- llvm::cl::desc(
- "Ignore an empty index file and perform non-ThinLTO compilation"),
- llvm::cl::init(false));
-
namespace {
enum {
@@ -2750,7 +2743,7 @@ Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
// v1: [pointer type, isconst, initid, linkage, alignment, section,
// visibility, threadlocal, unnamed_addr, externally_initialized,
- // dllstorageclass, comdat] (name in VST)
+ // dllstorageclass, comdat, attributes] (name in VST)
// v2: [strtab_offset, strtab_size, v1]
StringRef Name;
std::tie(Name, Record) = readNameFromStrtab(Record);
@@ -2830,6 +2823,11 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
} else if (hasImplicitComdat(RawLinkage)) {
NewGV->setComdat(reinterpret_cast<Comdat *>(1));
}
+
+ if (Record.size() > 12) {
+ auto AS = getAttributes(Record[12]).getFnAttributes();
+ NewGV->setAttributes(AS);
+ }
return Error::success();
}
@@ -5658,7 +5656,8 @@ Expected<bool> llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) {
}
Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
+llvm::getModuleSummaryIndexForFile(StringRef Path,
+ bool IgnoreEmptyThinLTOIndexFile) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
if (!FileOrErr)
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index 42135e5949ce..d80e1da911ca 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -500,7 +500,7 @@ class MetadataLoader::MetadataLoaderImpl {
// Upgrade variables attached to globals.
for (auto &GV : TheModule.globals()) {
- SmallVector<MDNode *, 1> MDs, NewMDs;
+ SmallVector<MDNode *, 1> MDs;
GV.getMetadata(LLVMContext::MD_dbg, MDs);
GV.eraseMetadata(LLVMContext::MD_dbg);
for (auto *MD : MDs)
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1b8d81a60201..1f8b50342c2d 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1109,7 +1109,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// GLOBALVAR: [strtab offset, strtab size, type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
// unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ // comdat, attributes]
Vals.push_back(StrtabBuilder.add(GV.getName()));
Vals.push_back(GV.getName().size());
Vals.push_back(VE.getTypeID(GV.getValueType()));
@@ -1124,13 +1124,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
GV.isExternallyInitialized() ||
GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
- GV.hasComdat()) {
+ GV.hasComdat() ||
+ GV.hasAttributes()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(getEncodedUnnamedAddr(GV));
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
+
+ auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex);
+ Vals.push_back(VE.getAttributeListID(AL));
} else {
AbbrevToUse = SimpleGVarAbbrev;
}
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 861150766986..fd76400331d9 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M,
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
- // Enumerate the global variable initializers.
- for (const GlobalVariable &GV : M.globals())
+ // Enumerate the global variable initializers and attributes.
+ for (const GlobalVariable &GV : M.globals()) {
if (GV.hasInitializer())
EnumerateValue(GV.getInitializer());
+ if (GV.hasAttributes())
+ EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex));
+ }
// Enumerate the aliasees.
for (const GlobalAlias &GA : M.aliases())
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 76549540ce0f..73fc2b35fe4e 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -21,5 +21,5 @@ add_subdirectory(LineEditor)
add_subdirectory(ProfileData)
add_subdirectory(Fuzzer)
add_subdirectory(Passes)
-add_subdirectory(LibDriver)
+add_subdirectory(ToolDrivers)
add_subdirectory(XRay)
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 87b45c001de4..98163bffb60b 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -767,7 +767,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// If our DISubprogram name is empty, use the mangled name.
if (FuncName.empty())
- FuncName = GlobalValue::getRealLinkageName(GV->getName());
+ FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
// Emit a symbol subsection, required by VS2012+ to find function boundaries.
OS.AddComment("Symbol subsection for " + Twine(FuncName));
@@ -888,13 +888,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
+ // If the variable has an attached offset expression, extract it.
+ // FIXME: Try to handle DW_OP_deref as well.
+ int64_t ExprOffset = 0;
+ if (VI.Expr)
+ if (!VI.Expr->extractIfOffset(ExprOffset))
+ continue;
+
// Get the frame register used and the offset.
unsigned FrameReg = 0;
int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
// Calculate the label ranges.
- LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ LocalVarDefRange DefRange =
+ createDefRangeMem(CVReg, FrameOffset + ExprOffset);
for (const InsnRange &Range : Scope->getRanges()) {
const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
const MCSymbol *End = getLabelAfterInsn(Range.second);
@@ -2194,7 +2202,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
if (GV->hasComdat()) {
MCSymbol *GVSym = Asm->getSymbol(GV);
OS.AddComment("Symbol subsection for " +
- Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
switchToDebugSectionForSymbol(GVSym);
EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
// FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 1d63e33a4d33..826162ad47c4 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -129,10 +129,9 @@ bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
}
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
- assert(Asm);
PrevInstBB = nullptr;
- if (!hasDebugInfo(MMI, MF)) {
+ if (!Asm || !hasDebugInfo(MMI, MF)) {
skippedNonDebugFunction();
return;
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 738e062cb93f..e172712cf889 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
auto *InlinedSP = getDISubprogram(DS);
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+ DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -634,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
LexicalScope *Scope) {
- DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+ DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
if (AbsDef)
return;
@@ -696,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
DIE *D = getDIE(SP);
- if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+ if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
if (D)
// If this subprogram has an abstract definition, reference that
addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -708,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
}
}
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+ DbgVariable *AbsVar = getExistingAbstractVariable(
+ InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+ auto *VariableDie = Var.getDIE();
+ if (AbsVar && AbsVar->getDIE()) {
+ addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+ const DILocalVariable *Cleansed;
+ return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+ InlinedVariable IV, const DILocalVariable *&Cleansed) {
+ // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first;
+ auto &AbstractVariables = getAbstractVariables();
+ auto I = AbstractVariables.find(Cleansed);
+ if (I != AbstractVariables.end())
+ return I->second.get();
+ return nullptr;
+}
+
+void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
+ DU->addScopeVariable(Scope, AbsDbgVariable.get());
+ getAbstractVariables()[Var] = std::move(AbsDbgVariable);
+}
+
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
if (!Skeleton) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 20a415150b4d..77e9e671529f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -68,6 +68,9 @@ class DwarfCompileUnit final : public DwarfUnit {
// ranges/locs.
const MCSymbol *BaseAddress;
+ DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+
/// \brief Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
@@ -76,6 +79,18 @@ class DwarfCompileUnit final : public DwarfUnit {
bool includeMinimalInlineScopes() const;
+ DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractSPDies;
+ return DU->getAbstractSPDies();
+ }
+
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractVariables;
+ return DU->getAbstractVariables();
+ }
+
public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU);
@@ -189,6 +204,13 @@ public:
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
void finishSubprogramDefinition(const DISubprogram *SP);
+ void finishVariableDefinition(const DbgVariable &Var);
+ /// Find abstract variable associated with Var.
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed);
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
+ void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 6f442f5c3172..3410b98d7776 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -71,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::desc("Generate dwarf aranges"),
cl::init(false));
+static cl::opt<bool> SplitDwarfCrossCuReferences(
+ "split-dwarf-cross-cu-references", cl::Hidden,
+ cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
+
namespace {
enum DefaultOnOff { Default, Enable, Disable };
}
@@ -362,21 +366,29 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) {
F(*SkelCU);
}
-void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+bool DwarfDebug::shareAcrossDWOCUs() const {
+ return SplitDwarfCrossCuReferences;
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+ LexicalScope *Scope) {
assert(Scope && Scope->getScopeNode());
assert(Scope->isAbstractScope());
assert(!Scope->getInlinedAt());
auto *SP = cast<DISubprogram>(Scope->getScopeNode());
- ProcessedSPNodes.insert(SP);
-
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
auto &CU = *CUMap.lookup(SP->getUnit());
- forBothCUs(CU, [&](DwarfCompileUnit &CU) {
+ if (auto *SkelCU = CU.getSkeleton()) {
+ (shareAcrossDWOCUs() ? CU : SrcCU)
+ .constructAbstractSubprogramScopeDIE(Scope);
+ if (CU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructAbstractSubprogramScopeDIE(Scope);
+ } else {
CU.constructAbstractSubprogramScopeDIE(Scope);
- });
+ }
}
void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
@@ -564,13 +576,7 @@ void DwarfDebug::finishVariableDefinitions() {
// DIE::getUnit isn't simple - it walks parent pointers, etc.
DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie());
assert(Unit);
- DbgVariable *AbsVar = getExistingAbstractVariable(
- InlinedVariable(Var->getVariable(), Var->getInlinedAt()));
- if (AbsVar && AbsVar->getDIE()) {
- Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
- *AbsVar->getDIE());
- } else
- Unit->applyVariableAttributes(*Var, *VariableDie);
+ Unit->finishVariableDefinition(*Var);
}
}
@@ -718,58 +724,32 @@ void DwarfDebug::endModule() {
}
// clean up.
- AbstractVariables.clear();
+ // FIXME: AbstractVariables.clear();
}
-// Find abstract variable, if any, associated with Var.
-DbgVariable *
-DwarfDebug::getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed) {
- // More then one inlined variable corresponds to one abstract variable.
- Cleansed = IV.first;
- auto I = AbstractVariables.find(Cleansed);
- if (I != AbstractVariables.end())
- return I->second.get();
- return nullptr;
-}
-
-DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
- const DILocalVariable *Cleansed;
- return getExistingAbstractVariable(IV, Cleansed);
-}
-
-void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
- LexicalScope *Scope) {
- assert(Scope && Scope->isAbstractScope());
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
- InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
- AbstractVariables[Var] = std::move(AbsDbgVariable);
-}
-
-void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV,
+void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
- createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
+ CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
cast<DILocalScope>(ScopeNode)));
}
-void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(
+void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU,
InlinedVariable IV, const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
if (LexicalScope *Scope =
LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
- createAbstractVariable(Cleansed, Scope);
+ CU.createAbstractVariable(Cleansed, Scope);
}
-
// Collect variable information from side table maintained by MF.
void DwarfDebug::collectVariableInfoFromMFTable(
- DenseSet<InlinedVariable> &Processed) {
+ DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) {
for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
if (!VI.Var)
continue;
@@ -784,7 +764,7 @@ void DwarfDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
- ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode());
auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
@@ -955,9 +935,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
}
}
-DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
+DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
InlinedVariable IV) {
- ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode());
ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
return ConcreteVariables.back().get();
@@ -980,7 +961,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const DISubprogram *SP,
DenseSet<InlinedVariable> &Processed) {
// Grab the variable info that was squirreled away in the MMI side-table.
- collectVariableInfoFromMFTable(Processed);
+ collectVariableInfoFromMFTable(TheCU, Processed);
for (const auto &I : DbgValues) {
InlinedVariable IV = I.first;
@@ -1002,7 +983,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
continue;
Processed.insert(IV);
- DbgVariable *RegVar = createConcreteVariable(*Scope, IV);
+ DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV);
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
@@ -1038,7 +1019,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
for (const DILocalVariable *DV : SP->getVariables()) {
if (Processed.insert(InlinedVariable(DV, nullptr)).second)
if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
- createConcreteVariable(*Scope, InlinedVariable(DV, nullptr));
+ createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
}
}
@@ -1229,12 +1210,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
for (const DILocalVariable *DV : SP->getVariables()) {
if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
continue;
- ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+ ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
DV->getScope());
assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
&& "ensureAbstractVariableIsCreated inserted abstract scopes");
}
- constructAbstractSubprogramScopeDIE(AScope);
+ constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
ProcessedSPNodes.insert(SP);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 8a96e7867b6e..b9c5aa9ffb23 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase {
DenseMap<const MCSymbol *, uint64_t> SymSize;
/// Collection of abstract variables.
- DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
/// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
@@ -313,20 +312,16 @@ class DwarfDebug : public DebugHandlerBase {
typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
- /// Find abstract variable associated with Var.
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed);
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
- void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
- void ensureAbstractVariableIsCreated(InlinedVariable Var,
+ void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+ void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+ DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope, InlinedVariable IV);
/// Construct a DIE for this abstract scope.
- void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
void finishVariableDefinitions();
@@ -446,7 +441,8 @@ class DwarfDebug : public DebugHandlerBase {
const DbgValueHistoryMap::InstrRanges &Ranges);
/// Collect variable information from the side table maintained by MF.
- void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+ void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+ DenseSet<InlinedVariable> &P);
protected:
/// Gather pre-function debug information.
@@ -518,6 +514,8 @@ public:
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
+ bool shareAcrossDWOCUs() const;
+
/// Returns the Dwarf Version.
uint16_t getDwarfVersion() const;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index d4d2ed277274..54924e9806ed 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -53,6 +53,7 @@ class DwarfFile {
// Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
/// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
/// be shared across CUs, that is why we keep the map here instead
@@ -105,6 +106,9 @@ public:
DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
return AbstractSPDies;
}
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ return AbstractVariables;
+ }
void insertDIE(const MDNode *TypeMD, DIE *Die) {
DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 8d25def7772c..667afbb450bd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -173,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
}
/// Check whether the DIE for this MDNode can be shared across CUs.
-static bool isShareableAcrossCUs(const DINode *D) {
+bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
// When the MDNode can be part of the type system, the DIE can be shared
// across CUs.
// Combining type units and cross-CU DIE sharing is lower value (since
@@ -181,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) {
// level already) but may be implementable for some value in projects
// building multiple independent libraries with LTO and then linking those
// together.
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return false;
return (isa<DIType>(D) ||
(isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
!GenerateDwarfTypeUnits;
@@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
addString(Die,
DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
: dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
+ GlobalValue::dropLLVMManglingEscape(LinkageName));
}
void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 8fc841703e23..7acad2cbd89f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -65,7 +65,7 @@ public:
//===----------------------------------------------------------------------===//
/// This dwarf writer support class manages information associated with a
/// source file.
- class DwarfUnit : public DIEUnit {
+class DwarfUnit : public DIEUnit {
protected:
/// MDNode for the compile unit.
const DICompileUnit *CUNode;
@@ -103,6 +103,9 @@ protected:
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
+ bool shareAcrossDWOCUs() const;
+ bool isShareableAcrossCUs(const DINode *D) const;
+
public:
// Accessors.
AsmPrinter* getAsmPrinter() const { return Asm; }
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 704f0ac2f191..815658bfb637 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -101,7 +101,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
// functions may still refer to it.
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
}
shouldEmitLSDA = hasEHFunclets;
@@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
// their funclet entry block's number.
const MachineFunction *MF = MBB->getParent();
const Function *F = MF->getFunction();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCContext &Ctx = MF->getContext();
StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
@@ -252,7 +252,7 @@ void WinException::endFunclet() {
!CurrentFuncletEntry->isCleanupFuncletEntry()) {
// If this is a C++ catch funclet (or the parent function),
// emit a reference to the LSDA for the parent function.
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
Twine("$cppxdata$", FuncLinkageName));
Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
@@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
// Emit a label assignment with the SEH frame offset so we can use it for
// llvm.x86.seh.recoverfp.
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
MCSymbol *ParentFrameOffset =
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
const MCExpr *MCOffset =
@@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
auto &OS = *Asm->OutStreamer;
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
MCSymbol *FuncInfoXData = nullptr;
@@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
MCStreamer &OS = *Asm->OutStreamer;
const Function *F = MF->getFunction();
- StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
bool VerboseAsm = OS.isVerboseAsm();
auto AddComment = [&](const Twine &Comment) {
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 9c19a4fd3c3e..17e6be05eb42 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -47,8 +47,7 @@ namespace {
bool runOnFunction(Function &F) override;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -224,22 +223,16 @@ bool AtomicExpand::runOnFunction(Function &F) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
- bool IsStore, IsLoad;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = false;
- IsLoad = true;
} else if (SI && isReleaseOrStronger(SI->getOrdering())) {
FenceOrdering = SI->getOrdering();
SI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = true;
- IsLoad = false;
} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
} else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isReleaseOrStronger(CASI->getSuccessOrdering()) ||
isAcquireOrStronger(CASI->getSuccessOrdering()))) {
@@ -250,11 +243,10 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = CASI->getSuccessOrdering();
CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
CASI->setFailureOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering);
}
}
@@ -320,13 +312,12 @@ bool AtomicExpand::runOnFunction(Function &F) {
return MadeChange;
}
-bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad) {
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
IRBuilder<> Builder(I);
- auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
- auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
// The trailing fence is emitted before the instruction instead of after
// because there is no easy way of setting Builder insertion point after
// an instruction. So we must erase it from the BB, and insert it back
@@ -1048,8 +1039,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1064,8 +1054,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasingStoreBB);
if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
@@ -1094,8 +1083,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// necessary.
Builder.SetInsertPoint(SuccessBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
@@ -1107,8 +1095,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(FailureBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
// Finally, we have control-flow based knowledge of whether the cmpxchg
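With the IsStore/IsLoad flags gone, a target's fence hooks receive the atomic instruction itself and can classify it directly. Below is a minimal free-standing sketch of the policy an override of TargetLoweringBase::emitLeadingFence can now express; the function name and the policy are hypothetical, while the parameter shape follows this diff.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/AtomicOrdering.h"
    using namespace llvm;

    // Hypothetical policy: only stores need a leading release fence; loads
    // and all other atomic operations get no leading fence at all.
    static Instruction *emitLeadingFenceSketch(IRBuilder<> &Builder,
                                               Instruction *Inst,
                                               AtomicOrdering Ord) {
      if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
        return Builder.CreateFence(AtomicOrdering::Release);
      return nullptr;
    }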
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 26da748fa244..55a27e2fb79e 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCodeGen
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
ExpandPostRAPseudos.cpp
+ ExpandReductions.cpp
FaultMaps.cpp
FEntryInserter.cpp
FuncletLayout.cpp
@@ -48,6 +49,7 @@ add_llvm_library(LLVMCodeGen
LivePhysRegs.cpp
LiveRangeCalc.cpp
LiveRangeEdit.cpp
+ LiveRangeShrink.cpp
LiveRegMatrix.cpp
LiveRegUnits.cpp
LiveStackAnalysis.cpp
@@ -118,6 +120,7 @@ add_llvm_library(LLVMCodeGen
SafeStack.cpp
SafeStackColoring.cpp
SafeStackLayout.cpp
+ ScalarizeMaskedMemIntrin.cpp
ScheduleDAG.cpp
ScheduleDAGInstrs.cpp
ScheduleDAGPrinter.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 3fc12ccc3b60..4d30c6574b12 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
+ initializeLiveRangeShrinkPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
@@ -79,7 +80,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
- initializeSafeStackPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
+ initializeScalarizeMaskedMemIntrinPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index c6c93811a0f9..f2e024c5e3bd 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -295,7 +295,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F))
- F.setSectionPrefix(".cold");
+ F.setSectionPrefix(".unlikely");
}
/// This optimization identifies DIV instructions that can be
@@ -1549,519 +1549,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
return MadeChange;
}
-// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask, <16 x i32> %passthru)
-// to a chain of basic blocks, loading each element individually if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.load, label %else
-//
-//cond.load: ; preds = %0
-// %4 = getelementptr i32* %1, i32 0
-// %5 = load i32* %4
-// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
-// br label %else
-//
-//else: ; preds = %0, %cond.load
-// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
-// %7 = extractelement <16 x i1> %mask, i32 1
-// %8 = icmp eq i1 %7, true
-// br i1 %8, label %cond.load1, label %else2
-//
-//cond.load1: ; preds = %else
-// %9 = getelementptr i32* %1, i32 1
-// %10 = load i32* %9
-// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
-// br label %else2
-//
-//else2: ; preds = %else, %cond.load1
-// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
-// %12 = extractelement <16 x i1> %mask, i32 2
-// %13 = icmp eq i1 %12, true
-// br i1 %13, label %cond.load4, label %else5
-//
-static void scalarizeMaskedLoad(CallInst *CI) {
- Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- Type *EltTy = CI->getType()->getVectorElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
-
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_load = icmp eq i1 %mask_1, true
- // br i1 %to_load, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask)
-// to a chain of basic blocks that store each element individually if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.store, label %else
-//
-// cond.store: ; preds = %0
-// %4 = extractelement <16 x i32> %val, i32 0
-// %5 = getelementptr i32* %1, i32 0
-// store i32 %4, i32* %5
-// br label %else
-//
-// else: ; preds = %0, %cond.store
-// %6 = extractelement <16 x i1> %mask, i32 1
-// %7 = icmp eq i1 %6, true
-// br i1 %7, label %cond.store1, label %else2
-//
-// cond.store1: ; preds = %else
-// %8 = extractelement <16 x i32> %val, i32 1
-// %9 = getelementptr i32* %1, i32 1
-// store i32 %8, i32* %9
-// br label %else2
-// . . .
-static void scalarizeMaskedStore(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(Src->getType());
- assert(VecType && "Unexpected data type in masked store intrinsic");
-
- Type *EltTy = VecType->getElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Builder.CreateAlignedStore(Src, Ptr, AlignVal);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_store = icmp eq i1 %mask_1, true
- // br i1 %to_store, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %OneElt = extractelement <16 x i32> %Src, i32 Idx
- // %EltAddr = getelementptr i32* %1, i32 0
- // store i32 %OneElt, i32* %EltAddr
- //
- BasicBlock *CondBlock =
- IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
-// Translate a masked gather intrinsic like
-// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
-// <16 x i1> %Mask, <16 x i32> %Src)
-// to a chain of basic blocks, loading each element individually if
-// the appropriate mask bit is set
-//
-// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> %Mask, i32 0
-// % ToLoad0 = icmp eq i1 % Mask0, true
-// br i1 % ToLoad0, label %cond.load, label %else
-//
-// cond.load:
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// % Load0 = load i32, i32* % Ptr0, align 4
-// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
-// br label %else
-//
-// else:
-// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
-// % Mask1 = extractelement <16 x i1> %Mask, i32 1
-// % ToLoad1 = icmp eq i1 % Mask1, true
-// br i1 % ToLoad1, label %cond.load1, label %else2
-//
-// cond.load1:
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// % Load1 = load i32, i32* % Ptr1, align 4
-// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
-// br label %else2
-// . . .
-// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
-// ret <16 x i32> %Result
-static void scalarizeMaskedGather(CallInst *CI) {
- Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
-
- assert(VecType && "Unexpected return type of masked gather intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
-
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
- unsigned VectorWidth = VecType->getNumElements();
-
- // Take a short-cut if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx),
- "Res" + Twine(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
- // %ToLoad1 = icmp eq i1 %Mask1, true
- // br i1 %ToLoad1, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToLoad" + Twine(Idx));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
- "Res" + Twine(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked scatter intrinsic, like
-// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
-// <16 x i1> %Mask)
-// to a chain of basic blocks that store each element individually if
-// the appropriate mask bit is set.
-//
-// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> % Mask, i32 0
-// % ToStore0 = icmp eq i1 % Mask0, true
-// br i1 %ToStore0, label %cond.store, label %else
-//
-// cond.store:
-// % Elt0 = extractelement <16 x i32> %Src, i32 0
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// store i32 %Elt0, i32* % Ptr0, align 4
-// br label %else
-//
-// else:
-// % Mask1 = extractelement <16 x i1> % Mask, i32 1
-// % ToStore1 = icmp eq i1 % Mask1, true
-// br i1 % ToStore1, label %cond.store1, label %else2
-//
-// cond.store1:
-// % Elt1 = extractelement <16 x i32> %Src, i32 1
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// store i32 % Elt1, i32* % Ptr1, align 4
-// br label %else2
-// . . .
-static void scalarizeMaskedScatter(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- assert(isa<VectorType>(Src->getType()) &&
- "Unexpected data type in masked scatter intrinsic");
- assert(isa<VectorType>(Ptrs->getType()) &&
- isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
- "Vector of pointers is expected in masked scatter intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- unsigned VectorWidth = Src->getType()->getVectorNumElements();
-
- // Take a short-cut if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- // Fill the "else" block, created in the previous iteration
- //
- // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
- // % ToStore = icmp eq i1 % Mask1, true
- // br i1 % ToStore, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp =
- Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToStore" + Twine(Idx));
-
- // Create "cond" block
- //
- // % Elt1 = extractelement <16 x i32> %Src, i32 1
- // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
- // store i32 %Elt1, i32* %Ptr1
- //
- BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
/// If counting leading or trailing zeros is an expensive operation and a zero
/// input is defined, add a check for zero to avoid calling the intrinsic.
///
@@ -2242,39 +1729,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
return true;
}
- case Intrinsic::masked_load: {
- // Scalarize unsupported vector masked load
- if (!TTI->isLegalMaskedLoad(CI->getType())) {
- scalarizeMaskedLoad(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_store: {
- if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedStore(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_gather: {
- if (!TTI->isLegalMaskedGather(CI->getType())) {
- scalarizeMaskedGather(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_scatter: {
- if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedScatter(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
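The masked load/store/gather/scatter scalarization deleted above is not lost: per the diffstat it moves into the new standalone ScalarizeMaskedMemIntrin pass, registered in the CodeGen.cpp hunk earlier. A sketch of scheduling that pass by hand in a legacy pipeline follows; the factory spelling is inferred from the initializeScalarizeMaskedMemIntrinPass registration and should be treated as an assumption.

    #include "llvm/CodeGen/Passes.h"
    #include "llvm/IR/LegacyPassManager.h"
    using namespace llvm;

    // Assumed factory name, matching the pass registration above. The pass
    // now performs the scalarization that CodeGenPrepare used to do inline.
    static void addScalarizePass(legacy::PassManagerBase &PM) {
      PM.add(createScalarizeMaskedMemIntrinPass());
    }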
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index ab2382e2db6d..e860906043dd 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
MachineOperand &DstMO = MI->getOperand(0);
MachineOperand &SrcMO = MI->getOperand(1);
- if (SrcMO.getReg() == DstMO.getReg()) {
- DEBUG(dbgs() << "identity copy: " << *MI);
+ bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+ if (IdentityCopy || SrcMO.isUndef()) {
+ DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI);
// No need to insert an identity copy instruction, but replace with a KILL
// if liveness is changed.
if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp
new file mode 100644
index 000000000000..a40ea28056dd
--- /dev/null
+++ b/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,167 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ return Instruction::FAdd;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ return Instruction::FMul;
+ case Intrinsic::experimental_vector_reduce_add:
+ return Instruction::Add;
+ case Intrinsic::experimental_vector_reduce_mul:
+ return Instruction::Mul;
+ case Intrinsic::experimental_vector_reduce_and:
+ return Instruction::And;
+ case Intrinsic::experimental_vector_reduce_or:
+ return Instruction::Or;
+ case Intrinsic::experimental_vector_reduce_xor:
+ return Instruction::Xor;
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return Instruction::ICmp;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unexpected ID");
+ }
+}
+
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_smax:
+ return RecurrenceDescriptor::MRK_SIntMax;
+ case Intrinsic::experimental_vector_reduce_smin:
+ return RecurrenceDescriptor::MRK_SIntMin;
+ case Intrinsic::experimental_vector_reduce_umax:
+ return RecurrenceDescriptor::MRK_UIntMax;
+ case Intrinsic::experimental_vector_reduce_umin:
+ return RecurrenceDescriptor::MRK_UIntMin;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ return RecurrenceDescriptor::MRK_FloatMax;
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return RecurrenceDescriptor::MRK_FloatMin;
+ default:
+ return RecurrenceDescriptor::MRK_Invalid;
+ }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+ bool Changed = false;
+ SmallVector<IntrinsicInst*, 4> Worklist;
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto II = dyn_cast<IntrinsicInst>(&*I))
+ Worklist.push_back(II);
+
+ for (auto *II : Worklist) {
+ IRBuilder<> Builder(II);
+ Value *Vec = nullptr;
+ auto ID = II->getIntrinsicID();
+ auto MRK = RecurrenceDescriptor::MRK_Invalid;
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ // FMFs must be attached to the call; otherwise it is an ordered reduction
+ // and it can't be handled by generating this shuffle sequence.
+ // TODO: Implement scalarization of ordered reductions here for targets
+ // without native support.
+ if (!II->getFastMathFlags().unsafeAlgebra())
+ continue;
+ Vec = II->getArgOperand(1);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ Vec = II->getArgOperand(0);
+ MRK = getMRK(ID);
+ break;
+ default:
+ continue;
+ }
+ if (!TTI->shouldExpandReduction(II))
+ continue;
+ auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ II->replaceAllUsesWith(Rdx);
+ II->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+ static char ID;
+ ExpandReductions() : FunctionPass(ID) {
+ initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return expandReductions(F, TTI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+ return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!expandReductions(F, &TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
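The expansion itself is delegated to getShuffleReduction from Transforms/Utils/LoopUtils (included above): for a power-of-two vector it boils down to log2(N) halving shuffles followed by an extract of lane 0. The sketch below restates that idea for plain binary opcodes, ignoring the min/max recurrence kinds selected via getMRK; it is an illustration, not the LoopUtils implementation.

    #include <cassert>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    static Value *shuffleReduceSketch(IRBuilder<> &B, Value *Vec,
                                      Instruction::BinaryOps Opc) {
      unsigned N = Vec->getType()->getVectorNumElements();
      assert(isPowerOf2_32(N) && "sketch assumes a power-of-two width");
      for (unsigned Width = N / 2; Width >= 1; Width /= 2) {
        // Move the upper Width lanes down; the remaining lanes are
        // don't-cares, so any in-range index will do.
        SmallVector<uint32_t, 16> Mask(N, 0);
        for (unsigned I = 0; I != Width; ++I)
          Mask[I] = Width + I;
        Value *Hi = B.CreateShuffleVector(
            Vec, UndefValue::get(Vec->getType()), Mask, "rdx.shuf");
        Vec = B.CreateBinOp(Opc, Vec, Hi, "rdx");
      }
      // After log2(N) halvings, lane 0 holds the fully reduced value.
      return B.CreateExtractElement(Vec, B.getInt32(0));
    }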
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index eaf4056e47ea..4d4591042296 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -162,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
return std::get<0>(getAction(MI, MRI)) == Legal;
}
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
LegalizeAction Action) const {
switch(Action) {
default:
@@ -174,20 +174,20 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
return Aspect.Type;
case NarrowScalar: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+ [](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
}
case WidenScalar: {
- return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+ return findLegalType(Aspect, [](LLT Ty) -> LLT {
return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
});
}
case FewerElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+ [](LLT Ty) -> LLT { return Ty.halfElements(); });
}
case MoreElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+ [](LLT Ty) -> LLT { return Ty.doubleElements(); });
}
}
}
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 7248f50945d0..2eb3cdee694d 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -204,12 +204,8 @@ uint64_t RegBankSelect::getRepairCost(
// TODO: use a dedicated constant for ImpossibleCost.
if (Cost != UINT_MAX)
return Cost;
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Legalization not available yet");
// Return the legalization cost of that repairing.
}
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Complex repairing not implemented yet");
return UINT_MAX;
}
@@ -452,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Sums up the repairing cost of MO at each insertion point.
uint64_t RepairCost = getRepairCost(MO, ValMapping);
+
+ // This is an impossible-to-repair cost.
+ if (RepairCost == UINT_MAX)
+ continue;
+
// Bias used for splitting: 5%.
const uint64_t PercentageForBias = 5;
uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 3c93f8123b0d..254bdf10d804 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -110,3 +110,11 @@ Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
return None;
}
+
+const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
+ return nullptr;
+ return MI->getOperand(1).getFPImm();
+}
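A short usage sketch for the new helper; the wrapper name and the zero-folding use case are hypothetical, only the getConstantFPVRegVal call is from this diff.

    #include "llvm/CodeGen/GlobalISel/Utils.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/IR/Constants.h"
    using namespace llvm;

    // Returns true when the vreg is defined by a G_FCONSTANT of zero (of
    // either sign), e.g. to let a selector fold fadd-with-zero patterns.
    static bool isFPZeroVReg(unsigned VReg, const MachineRegisterInfo &MRI) {
      if (const ConstantFP *CFP = getConstantFPVRegVal(VReg, MRI))
        return CFP->isZero();
      return false;
    }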
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 37fe41582333..628d599a3cc7 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -1318,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) {
return false;
PI = I++;
}
- return true;
+ // Finally, check whether the last I is indeed a successor of PI.
+ return PI->isSuccessor(&*I);
}
/// Invalidate predecessor BB info so it would be re-analyzed to determine if it
@@ -1587,22 +1588,32 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB);
}
+ // To be able to insert code freely at the end of BBI we sometimes remove
+ // the branch from BBI to NextMBB temporarily. Remember if this happened.
+ bool RemovedBranchToNextMBB = false;
if (CvtMBB.pred_size() > 1) {
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
- // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
- // explicitly remove CvtBBI as a successor.
+ // Keep the CFG updated.
BBI.BB->removeSuccessor(&CvtMBB, true);
} else {
// Predicate the 'true' block after removing its branch.
CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB);
PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
- // Now merge the entry of the triangle with the true block.
+ // Remove the branch from the entry of the triangle to NextMBB to be able to
+ // do the merge below. Keep the CFG updated, but remember we removed the
+ // branch since we do want to execute NextMBB, either by introducing a
+ // branch to it again, or merging it into the entry block.
+ // How it's handled is decided further down.
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+ BBI.BB->removeSuccessor(&NextMBB, true);
+ RemovedBranchToNextMBB = true;
+
+ // Now merge the entry of the triangle with the true block.
MergeBlocks(BBI, *CvtBBI, false);
}
@@ -1640,12 +1651,19 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
// block. By not merging them, we make it possible to iteratively
// ifcvt the blocks.
if (!HasEarlyExit &&
- NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+ // We might have removed BBI from NextMBB's predecessor list above but
+ // we want it to be there, so consider that too.
+ (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) &&
+ !NextBBI->HasFallThrough &&
!NextMBB.hasAddressTaken()) {
+ // We will merge NextBBI into BBI, and thus remove the current
+ // fallthrough from BBI into CvtBBI.
+ BBI.BB->removeSuccessor(&CvtMBB, true);
MergeBlocks(BBI, *NextBBI);
FalseBBDead = true;
} else {
InsertUncondBranch(*BBI.BB, NextMBB, TII);
+ BBI.BB->addSuccessor(&NextMBB);
BBI.HasFallThrough = false;
}
// Mixed predicated and unpredicated code. This cannot be iteratively
@@ -1653,8 +1671,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
IterIfcvt = false;
}
- RemoveExtraEdges(BBI);
-
// Update block info. BB can be iteratively if-converted.
if (!IterIfcvt)
BBI.IsDone = true;
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..00182e2c779f
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,211 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves an instruction close to the definitions of its operands to
+/// shrink the live ranges of those defs. The code motion is limited to within
+/// a basic block. A moved instruction must have a single def and more than one
+/// used register, each of which is the only use of its defining instruction.
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+ "Number of instructions hoisted to shrink live ranges.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveRangeShrink() : MachineFunctionPass(ID) {
+ initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Live Range Shrink"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+ false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it is dominated by \p Old, otherwise returns \p Old.
+/// \p M maps each instruction to its dominating order number, such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, returns \p Old. Otherwise, if \p Old is null,
+/// returns \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+ const InstOrderMap &M) {
+ auto NewIter = M.find(&New);
+ if (NewIter == M.end())
+ return Old;
+ if (Old == nullptr)
+ return &New;
+ unsigned OrderOld = M.find(Old)->second;
+ unsigned OrderNew = NewIter->second;
+ if (OrderOld != OrderNew)
+ return OrderOld < OrderNew ? &New : Old;
+ // OrderOld == OrderNew: iterate down from Old to see whether it can reach
+ // New; if it can, New is dominated by Old.
+ for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+ I = I->getNextNode())
+ if (I == &New)
+ return &New;
+ return Old;
+}
+
+/// Builds the map \p M from each instruction to its dominating order number,
+/// traversing forward from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+ M.clear();
+ unsigned i = 0;
+ for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+ M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ InstOrderMap IOM;
+ // Map from a register to the instruction order (the value in IOM) at which
+ // the register is last used. When moving an instruction up, we must make
+ // sure that none of its defs (including dead defs) is moved above that
+ // def's last use.
+ DenseMap<unsigned, unsigned> UseMap;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ bool SawStore = false;
+ BuildInstOrderMap(MBB.begin(), IOM);
+ UseMap.clear();
+
+ for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+ MachineInstr &MI = *Next;
+ ++Next;
+ if (MI.isPHI() || MI.isDebugValue())
+ continue;
+ if (MI.mayStore())
+ SawStore = true;
+
+ unsigned CurrentOrder = IOM[&MI];
+ unsigned Barrier = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDebug())
+ continue;
+ if (MO.isUse())
+ UseMap[MO.getReg()] = CurrentOrder;
+ else if (MO.isDead() && UseMap.count(MO.getReg()))
+ // Barrier is the last instruction where MO gets used. MI should not
+ // be moved above Barrier.
+ Barrier = std::max(Barrier, UseMap[MO.getReg()]);
+ }
+
+ if (!MI.isSafeToMove(nullptr, SawStore)) {
+ // If MI has side effects, it should become a barrier for code motion.
+ // IOM is rebuilt from the next instruction to prevent later
+ // instructions from being moved before this MI.
+ if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+ BuildInstOrderMap(Next, IOM);
+ SawStore = false;
+ }
+ continue;
+ }
+
+ const MachineOperand *DefMO = nullptr;
+ MachineInstr *Insert = nullptr;
+
+ // Number of live-ranges that will be shortened. We do not count
+ // live-ranges that are defined by a COPY as it could be coalesced later.
+ unsigned NumEligibleUse = 0;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDead() || MO.isDebug())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Do not move the instruction if it def/uses a physical register,
+ // unless it is a constant physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ !MRI.isConstantPhysReg(Reg)) {
+ Insert = nullptr;
+ break;
+ }
+ if (MO.isDef()) {
+ // Do not move if there is more than one def.
+ if (DefMO) {
+ Insert = nullptr;
+ break;
+ }
+ DefMO = &MO;
+ } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+ if (!DefInstr.isCopy())
+ NumEligibleUse++;
+ Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+ } else {
+ Insert = nullptr;
+ break;
+ }
+ }
+ // Move the instruction only when more than one live range will be shrunk.
+ if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+ MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+ // Skip all the PHI and debug instructions.
+ while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+ I = std::next(I);
+ if (I == MI.getIterator())
+ continue;
+
+ // Update the dominator order to be the same as the insertion point.
+ // We do this to maintain a non-decreasing order without needing to update
+ // all instruction orders after the insertion point.
+ unsigned NewOrder = IOM[&*I];
+ IOM[&MI] = NewOrder;
+ NumInstrsHoistedToShrinkLiveRange++;
+
+ // Find MI's debug value following MI.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+ if (MI.getOperand(0).isReg())
+ for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+ EndIter->getOperand(0).isReg() &&
+ EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+ ++EndIter, ++Next)
+ IOM[&*EndIter] = NewOrder;
+ MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+ }
+ }
+ }
+ return false;
+}
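The hoisting guard near the end of runOnMachineFunction combines three conditions: a single def, more than one eligible single-use operand, and a barrier no later than the insertion point's order number. A toy restatement with plain integers standing in for order numbers, entirely illustrative:

    #include <cassert>

    // Mirrors: DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]
    static bool shouldHoist(unsigned InsertPointOrder, unsigned BarrierOrder,
                            unsigned NumEligibleUse) {
      return NumEligibleUse > 1 && BarrierOrder <= InsertPointOrder;
    }

    int main() {
      // Two single-use operands whose later def sits at order 5; no barrier.
      assert(shouldHoist(/*InsertPointOrder=*/5, /*BarrierOrder=*/0, 2));
      // A dead def last used at order 7 forbids hoisting up to order 5.
      assert(!shouldHoist(5, 7, 2));
      return 0;
    }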
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 3568b0294ad9..a9aec926115a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- SmallSet<unsigned, 16> Defs, Kills;
+ DenseSet<unsigned> Defs, Kills;
MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1d..5003115a770f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
"that won't conflict."), cl::init(2),
cl::Hidden);
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+ "tail-dup-placement-aggressive-threshold",
+ cl::desc("Instruction cutoff for aggressive tail duplication during "
+ "layout. Used at -O3. Tail merging during layout is forced to "
+ "have a threshold that won't conflict."), cl::init(3),
+ cl::Hidden);
+
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDupPlacementPenalty(
"tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
assert(BlockToChain.empty());
assert(ComputedEdges.empty());
+ unsigned TailDupSize = TailDupPlacementThreshold;
+ // If only the aggressive threshold is explicitly set, use it.
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+ TailDupPlacementThreshold.getNumOccurrences() == 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // For aggressive optimization, we can adjust some thresholds to be less
+ // conservative.
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+ // At O3 we should be more willing to copy blocks for tail duplication. This
+ // increases size pressure, so we only do it at O3.
+ // Do this unless only the regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+ }
+
if (TailDupPlacement) {
MPDT = &getAnalysis<MachinePostDominatorTree>();
- unsigned TailDupSize = TailDupPlacementThreshold;
if (MF.getFunction()->optForSize())
TailDupSize = 1;
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
buildCFGChains();
// Changing the layout can create new tail merging opportunities.
- TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
// TailMerge can create jump into if branches that make CFG irreducible for
// HW that requires structured CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFoldPlacement;
// No tail merging opportunities if the block number is less than four.
if (MF.size() > 3 && EnableTailMerge) {
- unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+ unsigned TailMergeSize = TailDupSize + 1;
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
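The interplay of the two thresholds and the optimization level reduces to a small decision: an explicitly set aggressive threshold beats a defaulted regular one, and O3 prefers the aggressive value unless only the regular threshold was set by hand. Restated as a straight-line helper mirroring the two hunks above; illustrative only, not a proposed refactor.

    // RegularSet/AggressiveSet correspond to getNumOccurrences() != 0 for
    // -tail-dup-placement-threshold and its aggressive counterpart.
    static unsigned pickTailDupSize(bool RegularSet, bool AggressiveSet,
                                    bool IsO3, unsigned Regular,
                                    unsigned Aggressive) {
      unsigned Size = Regular;
      if (AggressiveSet && !RegularSet)
        Size = Aggressive; // explicit aggressive flag, defaulted regular flag
      if (IsO3 && (!RegularSet || AggressiveSet))
        Size = Aggressive; // O3 defaults to the aggressive value
      return Size;
    }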
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index bfb2cde030dc..ab433273b189 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -2063,12 +2063,12 @@ void MachineVerifier::verifyStackFrame() {
if (I.getOpcode() == FrameSetupOpcode) {
if (BBState.ExitIsSetup)
report("FrameSetup is after another FrameSetup", &I);
- BBState.ExitValue -= TII->getFrameSize(I);
+ BBState.ExitValue -= TII->getFrameTotalSize(I);
BBState.ExitIsSetup = true;
}
if (I.getOpcode() == FrameDestroyOpcode) {
- int Size = TII->getFrameSize(I);
+ int Size = TII->getFrameTotalSize(I);
if (!BBState.ExitIsSetup)
report("FrameDestroy is not after a FrameSetup", &I);
int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c67a25b888bf..db2264b2439d 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -34,7 +34,7 @@
#include <algorithm>
using namespace llvm;
-#define DEBUG_TYPE "phielim"
+#define DEBUG_TYPE "phi-node-elimination"
static cl::opt<bool>
DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index bf44ee8453b6..1803ea2b9249 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -3214,7 +3214,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
CurrList(WorkList.begin() + PrevSize, WorkList.end());
if (copyCoalesceWorkList(CurrList))
WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
- (MachineInstr*)nullptr), WorkList.end());
+ nullptr), WorkList.end());
}
void RegisterCoalescer::coalesceLocals() {
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 35db30f89976..0635e5c0a63c 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -62,10 +62,9 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
}
this->MBB = &MBB;
- for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
- IE = Scavenged.end(); I != IE; ++I) {
- I->Reg = 0;
- I->Restore = nullptr;
+ for (ScavengedInfo &SI : Scavenged) {
+ SI.Reg = 0;
+ SI.Restore = nullptr;
}
Tracking = false;
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 7fa379d80c6c..08b3d345f689 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -19,6 +19,7 @@
#include "SafeStackLayout.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -92,11 +93,11 @@ public:
/// determined statically), and the unsafe stack, which contains all
/// local variables that are accessed in ways that we can't prove to
/// be safe.
-class SafeStack : public FunctionPass {
- const TargetMachine *TM;
- const TargetLoweringBase *TL;
- const DataLayout *DL;
- ScalarEvolution *SE;
+class SafeStack {
+ Function &F;
+ const TargetLoweringBase &TL;
+ const DataLayout &DL;
+ ScalarEvolution &SE;
Type *StackPtrTy;
Type *IntPtrTy;
@@ -171,33 +172,21 @@ class SafeStack : public FunctionPass {
uint64_t AllocaSize);
public:
- static char ID; // Pass identification, replacement for typeid.
- SafeStack(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
- initializeSafeStackPass(*PassRegistry::getPassRegistry());
- }
- SafeStack() : SafeStack(nullptr) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
-
- StackPtrTy = Type::getInt8PtrTy(M.getContext());
- IntPtrTy = DL->getIntPtrType(M.getContext());
- Int32Ty = Type::getInt32Ty(M.getContext());
- Int8Ty = Type::getInt8Ty(M.getContext());
-
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-}; // class SafeStack
+ SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : F(F), TL(TL), DL(DL), SE(SE),
+ StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+ IntPtrTy(DL.getIntPtrType(F.getContext())),
+ Int32Ty(Type::getInt32Ty(F.getContext())),
+ Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+ // Run the transformation on the associated function.
+ // Returns whether the function was changed.
+ bool run();
+};
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
- uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
auto C = dyn_cast<ConstantInt>(AI->getArraySize());
if (!C)
@@ -209,11 +198,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
const Value *AllocaPtr, uint64_t AllocaSize) {
- AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+ AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+ const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
- uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
- ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+ uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
ConstantRange SizeRange =
ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
ConstantRange AccessRange = AccessStartRange.add(SizeRange);
@@ -226,8 +215,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
<< *AllocaPtr << "\n"
<< " Access " << *Addr << "\n"
<< " SCEV " << *Expr
- << " U: " << SE->getUnsignedRange(Expr)
- << ", S: " << SE->getSignedRange(Expr) << "\n"
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
<< " Range " << AccessRange << "\n"
<< " AllocaRange " << AllocaRange << "\n"
<< " " << (Safe ? "safe" : "unsafe") << "\n");
@@ -266,7 +255,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
switch (I->getOpcode()) {
case Instruction::Load: {
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
AllocaSize))
return false;
break;
@@ -282,7 +271,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
return false;
}
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
AllocaPtr, AllocaSize))
return false;
break;
@@ -343,7 +332,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
}
Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
- Value *StackGuardVar = TL->getIRStackGuard(IRB);
+ Value *StackGuardVar = TL.getIRStackGuard(IRB);
if (!StackGuardVar)
StackGuardVar =
F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
@@ -390,7 +379,7 @@ void SafeStack::findInsts(Function &F,
if (!Arg.hasByValAttr())
continue;
uint64_t Size =
- DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+ DL.getTypeStoreSize(Arg.getType()->getPointerElementType());
if (IsSafeStackAlloca(&Arg, Size))
continue;
@@ -476,19 +465,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (StackGuardSlot) {
Type *Ty = StackGuardSlot->getAllocatedType();
unsigned Align =
- std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
Align, SSC.getFullLiveRange());
}
for (Argument *Arg : ByValArguments) {
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+ unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
Arg->getParamAlignment());
SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
@@ -501,7 +490,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Ensure the object is properly aligned.
unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
}
@@ -539,7 +528,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
unsigned Offset = SSL.getObjectOffset(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
@@ -630,7 +619,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = DL->getTypeAllocSize(Ty);
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
@@ -638,7 +627,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
// Align the SP value to satisfy the AllocaInst, type and stack alignments.
unsigned Align = std::max(
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
(unsigned)StackAlignment);
assert(isPowerOf2_32(Align));
@@ -685,25 +674,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
-bool SafeStack::runOnFunction(Function &F) {
- DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
- if (!F.hasFnAttribute(Attribute::SafeStack)) {
- DEBUG(dbgs() << "[SafeStack] safestack is not requested"
- " for this function\n");
- return false;
- }
-
- if (F.isDeclaration()) {
- DEBUG(dbgs() << "[SafeStack] function definition"
- " is not available\n");
- return false;
- }
-
- if (!TM)
- report_fatal_error("Target machine is required");
- TL = TM->getSubtargetImpl(F)->getTargetLowering();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+ assert(F.hasFnAttribute(Attribute::SafeStack) &&
+ "Can't run SafeStack on a function without the attribute");
+ assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
++NumFunctions;
@@ -736,7 +710,7 @@ bool SafeStack::runOnFunction(Function &F) {
++NumUnsafeStackRestorePointsFunctions;
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
- UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
// Load the current stack pointer (we'll also use it as a base pointer).
// FIXME: use a dedicated register for it ?
@@ -788,14 +762,70 @@ bool SafeStack::runOnFunction(Function &F) {
return true;
}
+class SafeStackLegacyPass : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+ initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ if (!TM)
+ report_fatal_error("Target machine is required");
+ auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (!TL)
+ report_fatal_error("TargetLowering instance is required");
+
+ auto *DL = &F.getParent()->getDataLayout();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ // Compute DT and LI only for functions that have the attribute.
+ // This is only useful because the legacy pass manager doesn't let us
+ // compute analyses lazily.
+ // In the backend pipeline, nothing preserves DT before SafeStack, so we
+ // would otherwise always compute it wastefully, even if there is no
+ // function with the safestack attribute.
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+
+ ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+ return SafeStack(F, *TL, *DL, SE).run();
+ }
+};
+
} // anonymous namespace
-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
- return new SafeStack(TM);
+ return new SafeStackLegacyPass(TM);
}
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..dab5b91f50ad
--- /dev/null
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,660 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one by one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+ initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "Scalarize Masked Memory Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+ bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+ return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+// <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ Type *EltTy = CI->getType()->getVectorElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
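+ // A vector access aligned to AlignVal only guarantees
+ // min(AlignVal, element size) alignment for each individual element.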
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+
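+ // If the mask is a compile-time constant vector, no control flow is
+ // needed: load each known-set lane directly and let the final select
+ // pick the pass-through value for the unset lanes.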
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
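+ // Otherwise emit one conditional load per lane, threading the partial
+ // result vector through a PHI in each "else" block.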
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_load = icmp eq i1 %mask_1, true
+ // br i1 %to_load, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+ assert(VecType && "Unexpected data type in masked store intrinsic");
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
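+ // If the mask is a compile-time constant vector, no control flow is
+ // needed: just store each known-set lane directly.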
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
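+ // A store produces no value, so unlike the load case no PHI chain is
+ // needed; each lane simply gets its own conditional store block.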
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_store = icmp eq i1 %mask_1, true
+ // br i1 %to_store, label %cond.store, label %else
+ //
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+ assert(VecType && "Unexpected return type of masked gather intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(
+ VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
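+ // As in the masked-load case, emit one conditional load per lane and
+ // thread the partial result through a PHI in each "else" block.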
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %ToLoad1 = icmp eq i1 %Mask1, true
+ // br i1 %ToLoad1, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToLoad" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ assert(isa<VectorType>(Src->getType()) &&
+ "Unexpected data type in masked scatter intrinsic");
+ assert(isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+ // %ToStore = icmp eq i1 %Mask1, true
+ // br i1 %ToStore, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToStore" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %Elt1 = extractelement <16 x i32> %Src, i32 1
+ // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+ // store i32 %Elt1, i32* %Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ bool EverMadeChange = false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end();) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ if (ModifiedDTOnIteration)
+ break;
+ }
+
+ EverMadeChange |= MadeChange;
+ }
+
+ return EverMadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
+ bool MadeChange = false;
+
+ BasicBlock::iterator CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
+ MadeChange |= optimizeCallInst(CI, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
+
+ return MadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
+ bool &ModifiedDT) {
+
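+ // Each scalarize* helper below splits the current basic block into a
+ // chain of new blocks, so signal ModifiedDT and let the caller restart
+ // its iteration.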
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType())) {
+ scalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_gather: {
+ if (!TTI->isLegalMaskedGather(CI->getType())) {
+ scalarizeMaskedGather(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_scatter: {
+ if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedScatter(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ return false;
+}
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c77046fdfaf5..caf5cb497a71 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -114,7 +114,7 @@ namespace {
SmallPtrSet<SDNode *, 32> CombinedNodes;
// AA - Used for DAG load/store alias analysis.
- AliasAnalysis &AA;
+ AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
@@ -496,9 +496,9 @@ namespace {
SDValue distributeTruncateThroughAnd(SDNode *N);
public:
- DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
MaximumLegalStoreInBits = 0;
@@ -1729,10 +1729,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
NumLeftToConsider--;
}
- SDValue Result;
-
// If we've changed things around then replace token factor.
if (Changed) {
+ SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
@@ -1749,13 +1748,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
}
}
-
- // Add users to worklist, since we may introduce a lot of new
- // chained token factors while removing memory deps.
- return CombineTo(N, Result, true /*add to worklist*/);
+ return Result;
}
-
- return Result;
+ return SDValue();
}
/// MERGE_VALUES can always be eliminated.
@@ -2131,17 +2126,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
// canonicalize constant to RHS
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
- return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
- N1, N0, CarryIn);
+ return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
// fold (addcarry x, y, false) -> (uaddo x, y)
if (isNullConstant(CarryIn))
- return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N0, N1);
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
@@ -5313,17 +5308,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
}
- // If the target supports masking y in (shl, y),
- // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y)
- if (TLI.isOperationLegal(ISD::SHL, VT) &&
- TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
@@ -5331,7 +5315,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
// fold (shl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (shl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -5522,18 +5506,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (sra, y),
- // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y)
- if (TLI.isOperationLegal(ISD::SRA, VT) &&
- TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// Arithmetic shifting an all-sign-bit value is a no-op.
+ // fold (sra 0, x) -> 0
+ // fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
@@ -5548,12 +5523,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
- // fold (sra 0, x) -> 0
- if (isNullConstant(N0))
- return N0;
- // fold (sra -1, x) -> -1
- if (isAllOnesConstant(N0))
- return N0;
// fold (sra x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
return DAG.getUNDEF(VT);
@@ -5691,17 +5660,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (srl, y),
- // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y)
- if (TLI.isOperationLegal(ISD::SRL, VT) &&
- TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -5714,7 +5672,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
// fold (srl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (srl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -7365,14 +7323,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueSizeInBits(),
std::min(Op.getValueSizeInBits(),
VT.getSizeInBits()));
- if (TruncatedBits.isSubsetOf(Known.Zero)) {
- if (VT.bitsGT(Op.getValueType()))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
- if (VT.bitsLT(Op.getValueType()))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-
- return Op;
- }
+ if (TruncatedBits.isSubsetOf(Known.Zero))
+ return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
// fold (zext (truncate (load x))) -> (zext (smaller load x))
@@ -7419,14 +7371,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
- SDValue Op = N0.getOperand(0);
- if (SrcVT.bitsLT(VT)) {
- Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- } else if (SrcVT.bitsGT(VT)) {
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- }
+ SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+ AddToWorklist(Op.getNode());
return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
}
}
@@ -7440,11 +7386,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDLoc DL(N);
@@ -7669,14 +7611,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
}
// fold (aext (truncate x))
- if (N0.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncOp = N0.getOperand(0);
- if (TruncOp.getValueType() == VT)
- return TruncOp; // x iff x size == zext size.
- if (TruncOp.getValueType().bitsGT(VT))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
- return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
- }
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
@@ -7687,11 +7623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, DL, VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
@@ -14868,6 +14800,55 @@ SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) {
return SDValue();
}
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+// introducing undef's in the folded mask which are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
+ ShuffleVectorSDNode *Splat,
+ SelectionDAG &DAG) {
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+ // may be exposing to the users of the shuffle an undef element at index 1
+ // that was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(UserMask, SplatMask))
+ return SDValue(Splat, 0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : UserMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -14914,6 +14895,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
+ // A shuffle of a single vector that is a splat can always be folded.
+ if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
+ if (N1->isUndef() && N0Shuf->isSplat())
+ return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
+
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
@@ -16381,17 +16367,17 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
UseAA = false;
#endif
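+ // AA may now be null (no alias analysis available), in which case we
+ // skip the query and conservatively assume the operations may alias.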
- if (UseAA &&
+ if (UseAA && AA &&
Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
AliasResult AAResult =
- AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
- UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
- MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
- UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
+ AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
+ UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
+ UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
if (AAResult == NoAlias)
return false;
}
@@ -16605,7 +16591,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
}
/// This is the entry point for the file.
-void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 8c98e3740f6d..5003b79974eb 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -622,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
// have to worry about calling conventions and target-specific lowering code.
// Instead we perform the call lowering right here.
//
- // CALLSEQ_START(0...)
+ // CALLSEQ_START(0, 0...)
// STACKMAP(id, nbytes, ...)
// CALLSEQ_END(0, 0)
//
@@ -1150,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
}
- unsigned Offset = 0;
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return true;
+
Optional<MachineOperand> Op;
- if (const auto *Arg = dyn_cast<Argument>(Address))
- // Some arguments' frame index is recorded during argument lowering.
- Offset = FuncInfo.getArgumentFrameIndex(Arg);
- if (Offset)
- Op = MachineOperand::CreateFI(Offset);
- if (!Op)
- if (unsigned Reg = lookUpRegForValue(Address))
- Op = MachineOperand::CreateReg(Reg, false);
+ if (unsigned Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
// If we have a VLA that has a "use" in a metadata node that's then used
// here but it has no other uses, then we have a problem. E.g.,
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index cdf4d3a8b4e5..606b8952f3c1 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
MF = &mf;
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- MachineModuleInfo &MMI = MF->getMMI();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
@@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
InitializeRegForValue(&I);
- // Collect llvm.dbg.declare information. This is done now instead of
- // during the initial isel pass through the IR so that it is done
- // in a predictable order.
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) {
- assert(DI->getVariable() && "Missing variable");
- assert(DI->getDebugLoc() && "Missing location");
- if (MMI.hasDebugInfo()) {
- // Don't handle byval struct arguments or VLAs, for example.
- // Non-byval arguments are handled here (they refer to the stack
- // temporary alloca at this point).
- const Value *Address = DI->getAddress();
- if (Address) {
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
- Address = BCI->getOperand(0);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- DenseMap<const AllocaInst *, int>::iterator SI =
- StaticAllocaMap.find(AI);
- if (SI != StaticAllocaMap.end()) { // Check for VLAs.
- int FI = SI->second;
- MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
- FI, DI->getDebugLoc());
- }
- }
- }
- }
- }
-
// Decide the preferred extend type for a value.
PreferredExtendType[&I] = getPreferredExtendForValue(&I);
}
@@ -510,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
-/// If the argument does not have any assigned frame index then 0 is
-/// returned.
+/// If the argument does not have any assigned frame index then INT_MAX is
+/// returned.
int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
- DenseMap<const Argument *, int>::iterator I =
- ByValArgFrameIndexMap.find(A);
+ auto I = ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
return I->second;
DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
- return 0;
+ return INT_MAX;
}
unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2654b3ad7a62..9a47a914df91 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1493,7 +1493,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
@@ -4187,6 +4187,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
ReplacedNode(Node);
break;
}
+ case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::UDIV:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cde4331cc42d..4c3b514856b7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -675,6 +675,7 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 97a7fab6efd0..ff0e609803d8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
}
}
@@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
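+ // Map the reduction opcode to the binary opcode used to combine the two
+ // halves; FMIN/FMAX keep NaN-propagating semantics unless the nonans
+ // flag is present.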
+ bool NoNaN = N->getFlags().hasNoNaNs();
+ unsigned CombineOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
+ case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
+ case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
+ case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
+ case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
+ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
+ case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
+ case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
+ case ISD::VECREDUCE_FMAX:
+ CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ case ISD::VECREDUCE_FMIN:
+ CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ default:
+ llvm_unreachable("Unexpected reduce ISD node");
+ }
+
+ // Combine the two split halves with the corresponding vector binary
+ // operation, then reduce the resulting half-width vector.
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d605a1dc1c20..057badcd6b74 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2217,10 +2217,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
Known.resetAll();
@@ -2233,13 +2233,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -2359,7 +2358,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTTZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleTZ = Known2.One.countTrailingZeros();
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
unsigned LowBits = Log2_32(PossibleTZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2368,7 +2367,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTLZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ unsigned PossibleLZ = Known2.countMaxLeadingZeros();
unsigned LowBits = Log2_32(PossibleLZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2376,7 +2375,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTPOP: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we know some of the bits are zero, they can't be one.
- unsigned PossibleOnes = BitWidth - Known2.Zero.countPopulation();
+ unsigned PossibleOnes = Known2.countMaxPopulation();
Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
@@ -2493,13 +2492,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// going to be 0 in the result. Both addition and complement operations
// preserve the low zero bits.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
if (KnownZeroLow == 0)
break;
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
Known.Zero.setLowBits(KnownZeroLow);
break;
}
@@ -2526,15 +2524,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// and the other has the top 8 bits clear, we know the top 7 bits of the
// output must be clear.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroHigh = Known2.Zero.countLeadingOnes();
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
- KnownZeroHigh = std::min(KnownZeroHigh,
- Known2.Zero.countLeadingOnes());
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
// With ADDE and ADDCARRY, a carry bit may be added in, so we can only
@@ -2594,8 +2590,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- uint32_t Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ uint32_t Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -2711,8 +2707,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMIN - we know that the result will have the maximum of the
// known zero leading bits of the inputs.
- unsigned LeadZero = Known.Zero.countLeadingOnes();
- LeadZero = std::max(LeadZero, Known2.Zero.countLeadingOnes());
+ unsigned LeadZero = Known.countMinLeadingZeros();
+ LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2726,8 +2722,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMAX - we know that the result will have the maximum of the
// known one leading bits of the inputs.
- unsigned LeadOne = Known.One.countLeadingOnes();
- LeadOne = std::max(LeadOne, Known2.One.countLeadingOnes());
+ unsigned LeadOne = Known.countMinLeadingOnes();
+ LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2843,8 +2839,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// Fall back to computeKnownBits to catch other known cases.
KnownBits Known;
computeKnownBits(Val, Known);
- return (Known.Zero.countPopulation() == BitWidth - 1) &&
- (Known.One.countPopulation() == 1);
+ return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
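
The rewritten power-of-two test reads directly off the same helpers: countMaxPopulation() == 1 says at most one bit can be set, countMinPopulation() == 1 says at least one bit is set, so together they prove exactly one set bit. A small sketch showing the old and new forms agree, in plain C++ rather than the LLVM types:

#include <cassert>
#include <cstdint>

static unsigned popcount(uint32_t X) {
  unsigned N = 0;
  for (; X; X &= X - 1) // clear the lowest set bit per iteration
    ++N;
  return N;
}

int main() {
  const unsigned BitWidth = 32;
  uint32_t Zero = ~0x8u; // every bit except bit 3 is known zero
  uint32_t One = 0x8u;   // bit 3 is known one
  unsigned MaxPop = BitWidth - popcount(Zero); // countMaxPopulation()
  unsigned MinPop = popcount(One);             // countMinPopulation()
  bool OldTest = popcount(Zero) == BitWidth - 1 && popcount(One) == 1;
  bool NewTest = MaxPop == 1 && MinPop == 1;
  assert(OldTest && NewTest); // both forms prove the value is exactly 8
}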
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
@@ -2860,6 +2855,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2903,6 +2899,39 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
return Tmp;
+ case ISD::VECTOR_SHUFFLE: {
+ // Collect the minimum number of sign bits that are shared by every vector
+ // element referenced by the shuffle.
+ APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = SVN->getMaskElt(i);
+ if (!DemandedElts[i])
+ continue;
+ // For UNDEF elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (M < 0)
+ return 1;
+ if ((unsigned)M < NumElts)
+ DemandedLHS.setBit((unsigned)M % NumElts);
+ else
+ DemandedRHS.setBit((unsigned)M % NumElts);
+ }
+ Tmp = UINT_MAX;
+ if (!!DemandedLHS)
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ // If we don't know anything, early out and try computeKnownBits fall-back.
+ if (Tmp == 1)
+ break;
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+
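
In the new VECTOR_SHUFFLE case, mask entries below NumElts select lanes of operand 0 and the rest select lanes of operand 1, so the demanded-elements mask for the result is translated into one demanded mask per input. A small sketch of that split with plain bitmasks standing in for APInt; the mask values are made up for illustration:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumElts = 4;
  // shufflevector mask: result[i] = M[i] < 4 ? LHS[M[i]] : RHS[M[i] - 4]
  int Mask[NumElts] = {0, 5, 2, 7};
  uint32_t DemandedElts = 0b1011; // the caller wants result lanes 0, 1, 3
  uint32_t DemandedLHS = 0, DemandedRHS = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (!(DemandedElts & (1u << i)))
      continue; // lane 2 is never consulted
    unsigned M = unsigned(Mask[i]);
    if (M < NumElts)
      DemandedLHS |= 1u << (M % NumElts);
    else
      DemandedRHS |= 1u << (M % NumElts);
  }
  assert(DemandedLHS == 0b0001); // only LHS lane 0 feeds a demanded lane
  assert(DemandedRHS == 0b1010); // RHS lanes 1 and 3 feed lanes 1 and 3
}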
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
@@ -3142,14 +3171,36 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
- case ISD::EXTRACT_SUBVECTOR:
- return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ case ISD::EXTRACT_SUBVECTOR: {
+ // If we know the element index, just demand those subvector elements;
+ // otherwise demand them all.
+ SDValue Src = Op.getOperand(0);
+ ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+ // Offset the demanded elts by the subvector index.
+ uint64_t Idx = SubIdx->getZExtValue();
+ APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ }
+ return ComputeNumSignBits(Src, Depth + 1);
+ }
case ISD::CONCAT_VECTORS:
- // Determine the minimum number of sign bits across all input vectors.
- // Early out if the result is already 1.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i)
- Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1));
+ // Determine the minimum number of sign bits across all demanded
+ // elts of the input vectors. Early out if the result is already 1.
+ Tmp = UINT_MAX;
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+ APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+ DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+ if (!DemandedSub)
+ continue;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
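
Both vector cases above do the same demanded-elements bookkeeping in opposite directions: EXTRACT_SUBVECTOR widens the mask and shifts it up by the start index, while CONCAT_VECTORS shifts the mask down and truncates it once per operand. A plain-bitmask sketch of both mappings, with lane counts chosen for illustration:

#include <cassert>
#include <cstdint>

int main() {
  // extract_subvector: a 4-lane read starting at index 4 of an 8-lane source.
  uint8_t Demanded4 = 0b0110; // result lanes 1 and 2 are demanded
  unsigned Idx = 4;
  uint8_t DemandedSrc = uint8_t(Demanded4 << Idx); // zext(8) then shl(Idx)
  assert(DemandedSrc == 0b01100000); // source lanes 5 and 6

  // concat_vectors: an 8-lane result built from two 4-lane operands.
  uint8_t Demanded8 = 0b00110001;
  uint8_t DemandedSub0 = Demanded8 & 0xF;        // lshr(0) then trunc(4)
  uint8_t DemandedSub1 = (Demanded8 >> 4) & 0xF; // lshr(4) then trunc(4)
  assert(DemandedSub0 == 0b0001); // operand 0: only lane 0 matters
  assert(DemandedSub1 == 0b0011); // operand 1: lanes 0 and 1 matter
}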
@@ -3543,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
@@ -3559,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
- return getNode(ISD::ZERO_EXTEND, DL, VT,
- Operand.getNode()->getOperand(0));
+ return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
@@ -3579,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
- SDValue OpOp = Operand.getNode()->getOperand(0);
+ SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT)
return OpOp;
}
@@ -3601,16 +3651,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+ if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
- if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
- return Operand.getNode()->getOperand(0);
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+ if (Operand.getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+ return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
@@ -3665,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
- return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
- Operand.getNode()->getOperand(0),
- Operand.getNode()->getFlags());
+ return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
+ Operand.getOperand(0), Operand.getNode()->getFlags());
if (OpOpcode == ISD::FNEG) // --X -> X
- return Operand.getNode()->getOperand(0);
+ return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
- return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
@@ -5970,7 +6019,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -7520,9 +7569,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
KnownBits Known(PtrWidth);
- llvm::computeKnownBits(const_cast<GlobalValue *>(GV), Known,
- getDataLayout());
- unsigned AlignBits = Known.Zero.countTrailingOnes();
+ llvm::computeKnownBits(GV, Known, getDataLayout());
+ unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
return MinAlign(Align, GVOffset);
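
The surrounding code turns known-zero low bits into a byte alignment: AlignBits trailing zeros means the pointer is a multiple of 1 << AlignBits, clamped so the shift stays defined, and the result is then weakened by the global's offset via MinAlign. A sketch of the arithmetic, using the usual lowest-set-bit formulation as an assumption about what MinAlign computes:

#include <algorithm>
#include <cassert>

int main() {
  unsigned AlignBits = 4; // low four bits of the pointer are known zero
  unsigned Align = AlignBits ? 1u << std::min(31u, AlignBits) : 0;
  assert(Align == 16); // the pointer is at least 16-byte aligned

  // MinAlign(Align, GVOffset): adding an offset keeps only the common
  // low-zero bits, so a 16-byte aligned base plus offset 8 is 8-aligned.
  unsigned GVOffset = 8;
  unsigned Combined = (Align | GVOffset) & (~(Align | GVOffset) + 1);
  assert(Combined == 8);
}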
@@ -7621,7 +7669,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
return false;
// FIXME: The widths are based on this node's type, but build vectors can
- // truncate their operands.
+ // truncate their operands.
SplatValue = APInt(VecWidth, 0);
SplatUndef = APInt(VecWidth, 0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 50313e2da884..57d340c41c39 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -661,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
unsigned RegSize = RegisterVT.getSizeInBits();
unsigned NumSignBits = LOI->NumSignBits;
- unsigned NumZeroBits = LOI->Known.Zero.countLeadingOnes();
+ unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
if (NumZeroBits == RegSize) {
// The current value is a zero.
@@ -811,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
}
}
-void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
const TargetLibraryInfo *li) {
- AA = &aa;
+ AA = aa;
GFI = gfi;
LibInfo = li;
DL = &DAG.getDataLayout();
@@ -3423,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (isVolatile || NumValues > MaxParallelChains)
// Serialize volatile loads with other side effects.
Root = getRoot();
- else if (AA->pointsToConstantMemory(MemoryLocation(
+ else if (AA && AA->pointsToConstantMemory(MemoryLocation(
SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
@@ -3535,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
Type *Ty = I.getType();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- assert(!AA->pointsToConstantMemory(MemoryLocation(
- SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ assert((!AA || !AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
@@ -3817,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
- bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
@@ -3861,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
bool ConstantMemory = false;
if (UniformBase &&
- AA->pointsToConstantMemory(MemoryLocation(
+ AA && AA->pointsToConstantMemory(MemoryLocation(
BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
@@ -4676,7 +4676,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
- if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+ int FI = FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != INT_MAX)
Op = MachineOperand::CreateFI(FI);
if (!Op && N.getNode()) {
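
The EmitFuncArgumentDbgValue change fixes a truthiness bug: `if (int FI = ...)` treats frame index 0, which is perfectly valid, the same as "no frame index", so the code now reserves INT_MAX as an explicit not-found sentinel. A self-contained sketch of the bug; the lookup map and helper here are hypothetical:

#include <cassert>
#include <climits>
#include <map>

// Hypothetical lookup: argument 7 is stored at frame index 0.
static std::map<int, int> ArgFrameIndex = {{7, 0}};

static int getArgumentFrameIndexLike(int Arg) {
  auto It = ArgFrameIndex.find(Arg);
  return It == ArgFrameIndex.end() ? INT_MAX : It->second;
}

int main() {
  int FI = getArgumentFrameIndexLike(7);
  bool OldPatternFound = FI != 0;       // `if (int FI = ...)` truthiness
  bool NewPatternFound = FI != INT_MAX; // explicit sentinel comparison
  assert(!OldPatternFound && NewPatternFound); // old form loses FI #0
}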
@@ -4927,6 +4928,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return nullptr;
+
SDValue &N = NodeMap[Address];
if (!N.getNode() && isa<Argument>(Address))
// Check unused arguments map.
@@ -4957,20 +4965,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true,
N)) {
- // If variable is pinned by a alloca in dominating bb then
- // use StaticAllocaMap.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- if (AI->getParent() != DI.getParent()) {
- DenseMap<const AllocaInst*, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(AI);
- if (SI != FuncInfo.StaticAllocaMap.end()) {
- SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
- 0, dl, SDNodeOrder);
- DAG.AddDbgValue(SDV, nullptr, false);
- return nullptr;
- }
- }
- }
DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
@@ -5651,7 +5645,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
int FI = FuncInfo.StaticAllocaMap[Slot];
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(MF.getName()), Idx);
+ GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
@@ -5672,7 +5666,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()), IdxVal);
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
// Create a MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
@@ -5737,6 +5731,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
+
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ visitVectorReduce(I, Intrinsic);
+ return nullptr;
+ }
+
}
}
@@ -5982,7 +5994,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
bool ConstantMemory = false;
// Do not serialize (non-volatile) loads of constant memory with anything.
- if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+ if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
Root = Builder.DAG.getEntryNode();
ConstantMemory = true;
} else {
@@ -7422,11 +7434,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// have to worry about calling conventions and target specific lowering code.
// Instead we perform the call lowering right here.
//
- // chain, flag = CALLSEQ_START(chain, 0)
+ // chain, flag = CALLSEQ_START(chain, 0, 0)
// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
//
- Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+ Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
InFlag = Chain.getValue(1);
// Add the <id> and <numBytes> constants.
@@ -7616,6 +7628,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.getNumArgOperands() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ FastMathFlags FMF;
+ if (isa<FPMathOperator>(I))
+ FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ SDFlags.setNoNaNs(FMF.noNaNs());
+
+ switch (Intrinsic) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmax: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ }
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
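
visitVectorReduce maps each experimental reduction intrinsic onto a VECREDUCE_* node; for fadd/fmul the relaxed (unsafe-algebra) form takes only the vector operand, while the strict form threads the scalar accumulator through in order. Per the diffstat, targets without native reductions can have these expanded by the new ExpandReductions pass; a scalar sketch of the shuffle-halving idea behind such an expansion, for a power-of-two lane count:

#include <cassert>
#include <vector>

// Fold the upper half into the lower half log2(N) times, then read lane 0.
static int reduceAdd(std::vector<int> V) {
  for (size_t Half = V.size() / 2; Half != 0; Half /= 2)
    for (size_t i = 0; i != Half; ++i)
      V[i] += V[i + Half]; // one vector add per halving step
  return V[0];
}

int main() {
  assert(reduceAdd({1, 2, 3, 4, 5, 6, 7, 8}) == 36);
}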
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9e9989058ae5..bdaee858da61 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -604,11 +604,11 @@ public:
SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
CodeGenOpt::Level ol)
: CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()),
- DAG(dag), FuncInfo(funcinfo),
+ DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo),
HasTailCall(false) {
}
- void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
const TargetLibraryInfo *li);
/// Clear out the current SelectionDAG and the associated state and prepare
@@ -909,6 +909,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 26dd45ef933f..c37d7080f2c5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3aabdaeaa094..5e0feccb6b4c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -299,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
FuncInfo(new FunctionLoweringInfo()),
CurDAG(new SelectionDAG(tm, OL)),
SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
- GFI(),
+ AA(), GFI(),
OptLevel(OL),
DAGSize(0) {
initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
@@ -317,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() {
}
void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
AU.addRequired<StackProtector>();
AU.addPreserved<StackProtector>();
@@ -394,7 +396,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
@@ -406,12 +407,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
CurDAG->init(*MF, *ORE);
FuncInfo->set(Fn, *MF, CurDAG);
+ // Now get the optional analyses if we want them.
+ // This is based on the possibly changed OptLevel (after optnone is taken
+ // into account). That's unfortunate but OK because it just means we won't
+ // ask for passes that have been required anyway.
+
if (UseMBPI && OptLevel != CodeGenOpt::None)
FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
FuncInfo->BPI = nullptr;
- SDB->init(GFI, *AA, LibInfo);
+ if (OptLevel != CodeGenOpt::None)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ else
+ AA = nullptr;
+
+ SDB->init(GFI, AA, LibInfo);
MF->setHasInlineAsm(false);
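
From here on, AA is a possibly-null pointer rather than a reference: at -O0 the AAResultsWrapperPass is never requested, and every alias query in the builder degrades to the conservative answer through the `AA && AA->...` pattern seen above. A tiny sketch of that null-means-conservative convention; MiniAA is a made-up stand-in:

#include <cassert>

struct MiniAA {
  bool pointsToConstantMemory(const void *) const { return true; }
};

// Without analysis, assume the worst: the load must be chained.
static bool mustSerialize(const MiniAA *AA, const void *Ptr) {
  return !(AA && AA->pointsToConstantMemory(Ptr));
}

int main() {
  MiniAA AA;
  int X = 0;
  assert(!mustSerialize(&AA, &X));    // analysis proves constant memory
  assert(mustSerialize(nullptr, &X)); // no analysis: conservative answer
}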
@@ -715,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
@@ -747,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
@@ -781,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
@@ -807,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
@@ -1145,6 +1156,51 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
}
}
+/// Collect llvm.dbg.declare information. This is done after argument lowering
+/// in case the declarations refer to arguments.
+static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) {
+ MachineFunction *MF = FuncInfo->MF;
+ const DataLayout &DL = MF->getDataLayout();
+ for (const BasicBlock &BB : *FuncInfo->Fn) {
+ for (const Instruction &I : BB) {
+ const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I);
+ if (!DI)
+ continue;
+
+ assert(DI->getVariable() && "Missing variable");
+ assert(DI->getDebugLoc() && "Missing location");
+ const Value *Address = DI->getAddress();
+ if (!Address)
+ continue;
+
+ // Look through casts and constant offset GEPs. These mostly come from
+ // inalloca.
+ APInt Offset(DL.getPointerSizeInBits(0), 0);
+ Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // Check if the variable is a static alloca or a byval or inalloca
+ // argument passed in memory. If it is not, then we will ignore this
+ // intrinsic and handle this during isel like dbg.value.
+ int FI = INT_MAX;
+ if (const auto *AI = dyn_cast<AllocaInst>(Address)) {
+ auto SI = FuncInfo->StaticAllocaMap.find(AI);
+ if (SI != FuncInfo->StaticAllocaMap.end())
+ FI = SI->second;
+ } else if (const auto *Arg = dyn_cast<Argument>(Address))
+ FI = FuncInfo->getArgumentFrameIndex(Arg);
+
+ if (FI == INT_MAX)
+ continue;
+
+ DIExpression *Expr = DI->getExpression();
+ if (Offset.getBoolValue())
+ Expr = DIExpression::prepend(Expr, DIExpression::NoDeref,
+ Offset.getZExtValue());
+ MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc());
+ }
+ }
+}
+
/// Propagate swifterror values through the machine function CFG.
static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
auto *TLI = FuncInfo->TLI;
@@ -1317,6 +1373,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
}
createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+ processDbgDeclares(FuncInfo);
+
// Iterate over all basic blocks in the function.
for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23f597db140c..befbd80d7965 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -417,11 +417,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
TLI.isZExtFree(SmallVT, Op.getValueType())) {
// We found a type with free casts.
- SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(0)),
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(1)));
+ SDValue X = DAG.getNode(
+ Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)));
bool NeedZext = DemandedSize > SmallVTBits;
SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
dl, Op.getValueType(), X);
@@ -817,7 +816,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
- SDValue InnerOp = InOp.getNode()->getOperand(0);
+ SDValue InnerOp = InOp.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
unsigned InnerBits = InnerVT.getSizeInBits();
if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 4837495777da..2638702da152 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -282,8 +282,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
if (!Restore)
Restore = &MBB;
- else
+ else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it
+ // means the block never returns. If that's the
+ // case, we don't want to call
+ // `findNearestCommonDominator`, which will
+ // return `Restore`.
Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+ else
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
// Make sure we would be able to insert the restore code before the
// terminator.
@@ -293,7 +299,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
continue;
// One of the terminators needs to happen before the restore point.
if (MBB.succ_empty()) {
- Restore = nullptr;
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
break;
}
// Look for a restore point that post-dominates all the successors.
@@ -419,7 +425,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF,
}
bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
- if (MF.empty() || !isShrinkWrapEnabled(MF))
+ if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
return false;
DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index ab578df4069d..e9eff4d0acb2 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -93,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
doubleUnderDataTy, // __data
VoidPtrTy, // __personality
VoidPtrTy, // __lsda
- doubleUnderJBufTy, // __jbuf
- nullptr);
+ doubleUnderJBufTy // __jbuf
+ );
return true;
}
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 34892680aceb..1d232c71d824 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -232,7 +232,11 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
if (!MD)
return nullptr;
- auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+ const MDOperand &Op = MD->getOperand(0);
+ if (!Op.get())
+ return nullptr;
+
+ auto *VM = dyn_cast<ValueAsMetadata>(Op);
if (!VM)
report_fatal_error("MD_associated operand is not ValueAsMetadata");
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 150195f5f85b..e6c5d8753b83 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -487,6 +487,14 @@ void TargetPassConfig::addIRPasses() {
// Insert calls to mcount-like functions.
addPass(createCountingFunctionInserterPass());
+
+ // Add a pass to scalarize masked memory intrinsics that the target does
+ // not support natively. Each unsupported intrinsic is replaced with a
+ // chain of basic blocks that load/store one element at a time when the
+ // corresponding mask bit is set.
+ addPass(createScalarizeMaskedMemIntrinPass());
+
+ // Expand reduction intrinsics into shuffle sequences if the target wants to.
+ addPass(createExpandReductionsPass());
}
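
The new ScalarizeMaskedMemIntrin pass lowers a masked load or store the target cannot handle into a chain of basic blocks, one guarded scalar access per lane. A scalar C++ sketch of the control flow it emits for a masked load; the lane count and types are illustrative:

#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> maskedLoad(const int *Ptr, const std::array<bool, N> &Mask,
                              const std::array<int, N> &Passthru) {
  std::array<int, N> Result = Passthru;
  for (std::size_t i = 0; i != N; ++i)
    if (Mask[i])          // becomes a conditional branch per lane
      Result[i] = Ptr[i]; // the guarded scalar load
  return Result;
}

int main() {
  int Mem[4] = {10, 20, 30, 40};
  auto R = maskedLoad<4>(Mem, {true, false, true, false}, {0, -1, 0, -1});
  assert(R[0] == 10 && R[1] == -1 && R[2] == 30 && R[3] == -1);
}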
/// Turn exception handling constructs into something the code generators can
@@ -607,6 +615,9 @@ void TargetPassConfig::addMachinePasses() {
addPass(&LocalStackSlotAllocationID, false);
}
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&LiveRangeShrinkID);
+
// Run pre-ra passes.
addPreRegAlloc();
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 75359fe3c0ea..7392c8327148 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -155,7 +155,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
@@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
InstrItins = MF->getSubtarget().getInstrItineraryData();
LV = getAnalysisIfAvailable<LiveVariables>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAPass->getAAResults();
+ else
+ AA = nullptr;
OptLevel = TM.getOptLevel();
bool MadeChange = false;
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index f085132b6a94..407fd9b162e9 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -206,11 +206,12 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
if (InputReg != OutputReg) {
MachineRegisterInfo &MRI = F.getRegInfo();
unsigned InputSub = Input.getSubReg();
- if (InputSub == 0) {
- MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg));
+ if (InputSub == 0 &&
+ MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) {
MRI.replaceRegWith(OutputReg, InputReg);
} else {
- // The input register to the PHI has a subregister:
+ // The input register to the PHI has a subregister or it can't be
+ // constrained to the proper register class:
// insert a COPY instead of simply replacing the output
// with the input.
const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 410d5a3777d4..8d9353ae5f5e 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -13,7 +13,7 @@ add_llvm_library(LLVMDebugInfoCodeView
ModuleDebugFragmentVisitor.cpp
ModuleDebugInlineeLinesFragment.cpp
ModuleDebugLineFragment.cpp
- ModuleDebugUnknownFragment.cpp
+ RandomAccessTypeVisitor.cpp
RecordSerialization.cpp
StringTable.cpp
SymbolRecordMapping.cpp
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 0069ee3cc904..b6ed0453d9c4 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -26,8 +26,7 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
template <typename T>
-static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record,
- TypeVisitorCallbacks &Callbacks) {
+static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
TypeRecordKind RK = static_cast<TypeRecordKind>(Record.Type);
T KnownRecord(RK);
if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord))
@@ -76,7 +75,7 @@ void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
Handlers.push_back(&Handler);
}
-Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+Expected<bool> CVTypeVisitor::handleTypeServer(CVType &Record) {
if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) {
auto TS = deserializeTypeServerRecord(Record);
if (!TS)
@@ -90,16 +89,16 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
// If the handler processed the record, return success.
if (*ExpectedResult)
- return Error::success();
+ return true;
// Otherwise keep searching for a handler, eventually falling out and
// using the default record handler.
}
}
+ return false;
+}
- if (auto EC = Callbacks.visitTypeBegin(Record))
- return EC;
-
+Error CVTypeVisitor::finishVisitation(CVType &Record) {
switch (Record.Type) {
default:
if (auto EC = Callbacks.visitUnknownType(Record))
@@ -107,7 +106,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
break;
#define TYPE_RECORD(EnumName, EnumVal, Name) \
case EnumName: { \
- if (auto EC = visitKnownRecord<Name##Record>(*this, Record, Callbacks)) \
+ if (auto EC = visitKnownRecord<Name##Record>(Record, Callbacks)) \
return EC; \
break; \
}
@@ -124,6 +123,32 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
return Error::success();
}
+Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record, Index))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
+Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
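
This refactor splits the old monolithic visitTypeRecord into handleTypeServer, which returns Expected<bool> ("handled?" or an error), and finishVisitation, so the two entry points differ only in which visitTypeBegin overload they call. A minimal stand-in for that Expected<bool> calling convention; ExpectedBool is a toy, not llvm::Expected:

#include <optional>
#include <string>

// Success holds a bool, failure an error description; operator bool is
// true on success, matching llvm::Expected's convention.
struct ExpectedBool {
  std::optional<std::string> Err;
  bool Value = false;
  explicit operator bool() const { return !Err; }
};

static ExpectedBool handleTypeServerLike(bool RecordWasConsumed) {
  return ExpectedBool{std::nullopt, RecordWasConsumed};
}

int main() {
  ExpectedBool R = handleTypeServerLike(true);
  if (!R)
    return 1; // mirrors `return ExpectedResult.takeError();`
  if (R.Value)
    return 0; // mirrors `return Error::success();` -- already handled
  // Otherwise: visitTypeBegin(...) then finishVisitation(Record).
  return 0;
}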
static Error visitMemberRecord(CVMemberRecord &Record,
TypeVisitorCallbacks &Callbacks) {
if (auto EC = Callbacks.visitMemberBegin(Record))
diff --git a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp b/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
deleted file mode 100644
index 9fd2cb8ed3e8..000000000000
--- a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-//===- ModuleDebugUnknownFragment.cpp ---------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h" \ No newline at end of file
diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
new file mode 100644
index 000000000000..4cb9acbe07d9
--- /dev/null
+++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
@@ -0,0 +1,91 @@
+//===- RandomAccessTypeVisitor.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
+
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+RandomAccessTypeVisitor::RandomAccessTypeVisitor(
+ const CVTypeArray &Types, uint32_t NumRecords,
+ PartialOffsetArray PartialOffsets)
+ : Database(NumRecords), Types(Types), DatabaseVisitor(Database),
+ InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(DatabaseVisitor);
+
+ KnownOffsets.resize(Database.capacity());
+}
+
+Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI,
+ TypeVisitorCallbacks &Callbacks) {
+ assert(TI.toArrayIndex() < Database.capacity());
+
+ if (!Database.contains(TI)) {
+ if (auto EC = visitRangeForType(TI))
+ return EC;
+ }
+
+ assert(Database.contains(TI));
+ auto &Record = Database.getTypeRecord(TI);
+ CVTypeVisitor V(Callbacks);
+ return V.visitTypeRecord(Record, TI);
+}
+
+Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) {
+ if (PartialOffsets.empty()) {
+ TypeIndex TIB(TypeIndex::FirstNonSimpleIndex);
+ TypeIndex TIE = TIB + Database.capacity();
+ return visitRange(TIB, 0, TIE);
+ }
+
+ auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
+ [](TypeIndex Value, const TypeIndexOffset &IO) {
+ return Value < IO.Type;
+ });
+
+ assert(Next != PartialOffsets.begin());
+ auto Prev = std::prev(Next);
+
+ TypeIndex TIB = Prev->Type;
+ TypeIndex TIE;
+ if (Next == PartialOffsets.end()) {
+ TIE = TypeIndex::fromArrayIndex(Database.capacity());
+ } else {
+ TIE = Next->Type;
+ }
+
+ if (auto EC = visitRange(TIB, Prev->Offset, TIE))
+ return EC;
+ return Error::success();
+}
+
+Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset,
+ TypeIndex End) {
+
+ auto RI = Types.at(BeginOffset);
+ assert(RI != Types.end());
+
+ while (Begin != End) {
+ assert(!Database.contains(Begin));
+ if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin))
+ return EC;
+ KnownOffsets[Begin.toArrayIndex()] = BeginOffset;
+
+ BeginOffset += RI.getRecordLength();
+ ++Begin;
+ ++RI;
+ }
+
+ return Error::success();
+}
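
visitRangeForType locates the smallest already-indexed range containing a type index by binary-searching the sparse (type index, stream offset) anchors: std::upper_bound finds the first anchor past the query, and its predecessor gives the offset to start deserializing from. A self-contained sketch with made-up anchor values:

#include <algorithm>
#include <cassert>
#include <vector>

struct TypeIndexOffset {
  unsigned Type;   // first type index covered by this anchor
  unsigned Offset; // byte offset of that record in the type stream
};

int main() {
  std::vector<TypeIndexOffset> PartialOffsets = {
      {0x1000, 0}, {0x1040, 512}, {0x1080, 1100}};
  unsigned TI = 0x1052; // the record we want to materialize
  auto Next = std::upper_bound(
      PartialOffsets.begin(), PartialOffsets.end(), TI,
      [](unsigned Value, const TypeIndexOffset &IO) { return Value < IO.Type; });
  auto Prev = std::prev(Next); // last anchor at or before TI
  assert(Prev->Type == 0x1040 && Prev->Offset == 512);
  // Visit records linearly from Prev->Offset until TI is materialized.
}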
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index 5b8841041f88..7924440e5e29 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -65,20 +65,32 @@ static const SimpleTypeEntry SimpleTypeNames[] = {
{"__bool64*", SimpleTypeKind::Boolean64},
};
-TypeDatabase::TypeDatabase(uint32_t ExpectedSize) : TypeNameStorage(Allocator) {
- CVUDTNames.reserve(ExpectedSize);
- TypeRecords.reserve(ExpectedSize);
+TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) {
+ CVUDTNames.resize(Capacity);
+ TypeRecords.resize(Capacity);
+ ValidRecords.resize(Capacity);
}
-/// Gets the type index for the next type record.
-TypeIndex TypeDatabase::getNextTypeIndex() const {
- return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size());
+TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) {
+ TypeIndex TI;
+ TI = getAppendIndex();
+ if (TI.toArrayIndex() >= capacity())
+ grow();
+ recordType(Name, TI, Data);
+ return TI;
}
-/// Records the name of a type, and reserves its type index.
-void TypeDatabase::recordType(StringRef Name, const CVType &Data) {
- CVUDTNames.push_back(Name);
- TypeRecords.push_back(Data);
+void TypeDatabase::recordType(StringRef Name, TypeIndex Index,
+ const CVType &Data) {
+ uint32_t AI = Index.toArrayIndex();
+
+ assert(!contains(Index));
+ assert(AI < capacity());
+
+ CVUDTNames[AI] = Name;
+ TypeRecords[AI] = Data;
+ ValidRecords.set(AI);
+ ++Count;
}
/// Saves the name in a StringSet and creates a stable StringRef.
@@ -104,24 +116,47 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
return "<unknown simple type>";
}
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (I < CVUDTNames.size())
- return CVUDTNames[I];
+ if (contains(Index))
+ return CVUDTNames[Index.toArrayIndex()];
return "<unknown UDT>";
}
const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
}
CVType &TypeDatabase::getTypeRecord(TypeIndex Index) {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
+}
+
+bool TypeDatabase::contains(TypeIndex Index) const {
+ uint32_t AI = Index.toArrayIndex();
+ if (AI >= capacity())
+ return false;
+
+ return ValidRecords.test(AI);
}
-bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- return I < CVUDTNames.size();
+uint32_t TypeDatabase::size() const { return Count; }
+
+uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); }
+
+void TypeDatabase::grow() {
+ TypeRecords.emplace_back();
+ CVUDTNames.emplace_back();
+ ValidRecords.resize(ValidRecords.size() + 1);
}
-uint32_t TypeDatabase::size() const { return CVUDTNames.size(); }
+bool TypeDatabase::empty() const { return size() == 0; }
+
+TypeIndex TypeDatabase::getAppendIndex() const {
+ if (empty())
+ return TypeIndex::fromArrayIndex(0);
+
+ int Index = ValidRecords.find_last();
+ assert(Index != -1);
+ return TypeIndex::fromArrayIndex(Index) + 1;
+}
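
TypeDatabase is now sparse: records can land at arbitrary indices via the random-access visitor, ValidRecords tracks which slots are filled, and getAppendIndex places sequential appends one past the highest valid slot. A sketch of that find_last() + 1 logic over a plain bit vector:

#include <cassert>
#include <vector>

// find_last() over a plain bit vector: highest set index, or -1 if none.
static int findLast(const std::vector<bool> &Valid) {
  for (int i = int(Valid.size()) - 1; i >= 0; --i)
    if (Valid[i])
      return i;
  return -1;
}

int main() {
  std::vector<bool> ValidRecords(8, false);
  ValidRecords[0] = true; // an appended record
  ValidRecords[5] = true; // a record written out of order at index 5
  assert(findLast(ValidRecords) == 5);
  unsigned AppendIndex = unsigned(findLast(ValidRecords)) + 1;
  assert(AppendIndex == 6); // the next append goes one past the last slot
}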
diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
index c234afd2288b..8d97f8b1cb40 100644
--- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
using namespace llvm::codeview;
-Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) {
assert(!IsInFieldList);
// Reset Name to the empty string. If the visitor sets it, we know it.
Name = "";
@@ -28,6 +28,22 @@ Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
return Error::success();
}
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
+ if (auto EC = visitTypeBegin(Record))
+ return EC;
+
+ CurrentTypeIndex = Index;
+ return Error::success();
+}
+
+StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const {
+ return TypeDB->getTypeName(Index);
+}
+
+StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) {
+ return TypeDB->saveTypeName(Name);
+}
+
Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
if (CVR.Type == LF_FIELDLIST) {
assert(IsInFieldList);
@@ -39,7 +55,12 @@ Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
// CVUDTNames is indexed by type index, and must have one entry for every
// type. Field list members are not recorded, and are only referenced by
// their containing field list record.
- TypeDB.recordType(Name, CVR);
+ if (CurrentTypeIndex)
+ TypeDB->recordType(Name, *CurrentTypeIndex, CVR);
+ else
+ TypeDB->appendType(Name, CVR);
+
+ CurrentTypeIndex.reset();
return Error::success();
}
@@ -73,13 +94,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
uint32_t Size = Indices.size();
SmallString<256> TypeName("(");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append(", ");
}
TypeName.push_back(')');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -89,13 +110,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
uint32_t Size = Indices.size();
SmallString<256> TypeName("\"");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append("\" \"");
}
TypeName.push_back('\"');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -132,26 +153,26 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
ProcedureRecord &Proc) {
- StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType());
- StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(Proc.getReturnType());
+ StringRef ArgListTypeName = getTypeName(Proc.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
MemberFunctionRecord &MF) {
- StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType());
- StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType());
- StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(MF.getReturnType());
+ StringRef ClassTypeName = getTypeName(MF.getClassType());
+ StringRef ArgListTypeName = getTypeName(MF.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ClassTypeName);
TypeName.append("::");
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -171,13 +192,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isPointerToMember()) {
const MemberPointerInfo &MI = Ptr.getMemberInfo();
- StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType());
- StringRef ClassName = TypeDB.getTypeName(MI.getContainingType());
+ StringRef PointeeName = getTypeName(Ptr.getReferentType());
+ StringRef ClassName = getTypeName(MI.getContainingType());
SmallString<256> TypeName(PointeeName);
TypeName.push_back(' ');
TypeName.append(ClassName);
TypeName.append("::*");
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
} else {
SmallString<256> TypeName;
if (Ptr.isConst())
@@ -187,7 +208,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isUnaligned())
TypeName.append("__unaligned ");
- TypeName.append(TypeDB.getTypeName(Ptr.getReferentType()));
+ TypeName.append(getTypeName(Ptr.getReferentType()));
if (Ptr.getMode() == PointerMode::LValueReference)
TypeName.append("&");
@@ -197,7 +218,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
TypeName.append("*");
if (!TypeName.empty())
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
}
return Error::success();
}
@@ -205,7 +226,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
- StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType());
+ StringRef ModifiedName = getTypeName(Mod.getModifiedType());
SmallString<256> TypeName;
if (Mods & uint16_t(ModifierOptions::Const))
TypeName.append("const ");
@@ -214,14 +235,14 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
if (Mods & uint16_t(ModifierOptions::Unaligned))
TypeName.append("__unaligned ");
TypeName.append(ModifiedName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
VFTableShapeRecord &Shape) {
- Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) +
- " methods>");
+ Name =
+ saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + " methods>");
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 870d95221e7d..27a6e0987886 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -173,10 +173,13 @@ void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
+ TypeIndex TI = getSourceDB().getAppendIndex();
+ return visitTypeBegin(Record, TI);
+}
+
+Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
W->startLine() << getLeafTypeName(Record.Type);
- W->getOStream() << " ("
- << HexNumber(getSourceDB().getNextTypeIndex().getIndex())
- << ")";
+ W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")";
W->getOStream() << " {\n";
W->indent();
W->printEnum("TypeLeafKind", unsigned(Record.Type),
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 246899ac12b9..59a060d143ff 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -66,7 +66,7 @@ uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size,
RelocAddrMap::const_iterator AI = Relocs->find(*Off);
if (AI == Relocs->end())
return Data.getUnsigned(Off, Size);
- return Data.getUnsigned(Off, Size) + AI->second.second;
+ return Data.getUnsigned(Off, Size) + AI->second.Value;
}
static void dumpAccelSection(raw_ostream &OS, StringRef Name,
@@ -905,16 +905,23 @@ static Error createError(const Twine &Reason, llvm::Error E) {
/// Returns the address of the symbol that a relocation is applied against.
/// Used for further relocation computation. The symbol's section load address
/// is taken into account if a LoadedObjectInfo interface is provided.
-static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
- const RelocationRef &Reloc,
- const LoadedObjectInfo *L) {
+static Expected<uint64_t>
+getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc,
+ const LoadedObjectInfo *L,
+ std::map<SymbolRef, uint64_t> &Cache) {
uint64_t Ret = 0;
object::section_iterator RSec = Obj.section_end();
object::symbol_iterator Sym = Reloc.getSymbol();
+ std::map<SymbolRef, uint64_t>::iterator CacheIt = Cache.end();
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
+ bool New;
+ std::tie(CacheIt, New) = Cache.insert({*Sym, 0});
+ if (!New)
+ return CacheIt->second;
+
Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
if (!SymAddrOrErr)
return createError("error: failed to compute symbol address: ",
@@ -943,6 +950,10 @@ static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
if (L && RSec != Obj.section_end())
if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
Ret += SectionLoadAddress - RSec->getAddress();
+
+ if (CacheIt != Cache.end())
+ CacheIt->second = Ret;
+
return Ret;
}
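
getSymbolAddress now memoizes per-symbol results across a section's relocations: std::map::insert both detects a previously computed address and reserves the cache slot in a single lookup, and the slot is filled just before returning. A sketch of the pattern; the address computation is a stand-in:

#include <cassert>
#include <cstdint>
#include <map>

static int ComputeCalls = 0;

static uint64_t getAddrCached(int Sym, std::map<int, uint64_t> &Cache) {
  auto [It, New] = Cache.insert({Sym, 0}); // one lookup: probe and reserve
  if (!New)
    return It->second; // reuse the answer from an earlier relocation
  ++ComputeCalls;      // stand-in for the expensive symbol-address walk
  It->second = 0x400000u + uint64_t(Sym) * 16;
  return It->second;
}

int main() {
  std::map<int, uint64_t> Cache;
  uint64_t A = getAddrCached(3, Cache);
  uint64_t B = getAddrCached(3, Cache);
  assert(A == B && ComputeCalls == 1);
}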
@@ -1075,6 +1086,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
}
+ std::map<SymbolRef, uint64_t> AddrCache;
if (Section.relocation_begin() != Section.relocation_end()) {
uint64_t SectionSize = RelocatedSection->getSize();
for (const RelocationRef &Reloc : Section.relocations()) {
@@ -1083,7 +1095,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
if (isRelocScattered(Obj, Reloc))
continue;
- Expected<uint64_t> SymAddrOrErr = getSymbolAddress(Obj, Reloc, L);
+ Expected<uint64_t> SymAddrOrErr =
+ getSymbolAddress(Obj, Reloc, L, AddrCache);
if (!SymAddrOrErr) {
errs() << toString(SymAddrOrErr.takeError()) << '\n';
continue;
@@ -1114,7 +1127,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
<< " at " << format("%p", Address)
<< " with width " << format("%d", R.Width)
<< "\n");
- Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+ Map->insert({Address, {(uint8_t)R.Width, R.Value}});
}
}
}
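A note on the caching added to getSymbolAddress above: a single map insertion
both detects a prior result and reserves the slot for a new one. A minimal
self-contained sketch of that insert-then-fill pattern (names hypothetical,
not LLVM API):

#include <cstdint>
#include <map>
#include <tuple>

// Returns the cached value for Key, invoking Compute at most once per key.
template <typename KeyT, typename ComputeFn>
uint64_t lookupOrCompute(std::map<KeyT, uint64_t> &Cache, const KeyT &Key,
                         ComputeFn Compute) {
  typename std::map<KeyT, uint64_t>::iterator It;
  bool Inserted;
  std::tie(It, Inserted) = Cache.insert({Key, 0});
  if (!Inserted)
    return It->second;       // Hit: a prior relocation already computed this.
  It->second = Compute(Key); // Miss: compute once, publish through the slot.
  return It->second;
}

As in the diff, the slot briefly holds 0 between insertion and the final
store, so this sketch assumes Compute never re-enters the cache for the same
key.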
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 0cf71f530446..6601393d7459 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -54,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
if (ParsedCUOffsets.insert(CUOffset).second) {
DWARFAddressRangesVector CURanges;
CU->collectAddressRanges(CURanges);
- for (const auto &R : CURanges) {
- appendRange(CUOffset, R.first, R.second);
- }
+ for (const auto &R : CURanges)
+ appendRange(CUOffset, R.LowPC, R.HighPC);
}
}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index 9380fe8fe85d..8da797750abd 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
BaseAddress = RLE.EndAddress;
} else {
- Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
- BaseAddress + RLE.EndAddress));
+ Res.push_back(
+ {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress});
}
}
return Res;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 24039eb35209..e3bd759ba94b 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -60,8 +60,8 @@ static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
OS << '\n';
OS.indent(Indent);
OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
- AddressSize*2, Range.first,
- AddressSize*2, Range.second);
+ AddressSize*2, Range.LowPC,
+ AddressSize*2, Range.HighPC);
}
}
@@ -229,9 +229,9 @@ DWARFDie::getAddressRanges() const {
return DWARFAddressRangesVector();
// Single range specified by low/high PC.
uint64_t LowPC, HighPC;
- if (getLowAndHighPC(LowPC, HighPC)) {
- return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
- }
+ if (getLowAndHighPC(LowPC, HighPC))
+ return {{LowPC, HighPC}};
+
// Multiple ranges from .debug_ranges section.
auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
if (RangesOffset) {
@@ -257,7 +257,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const {
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
for (const auto& R : getAddressRanges()) {
- if (R.first <= Address && Address < R.second)
+ if (R.LowPC <= Address && Address < R.HighPC)
return true;
}
return false;
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index e0f819383289..25824f6eb83b 100644
--- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -24,7 +24,11 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
return false;
TypeHash = debug_info.getU64(offset_ptr);
TypeOffset = debug_info.getU32(offset_ptr);
- return TypeOffset < getLength();
+ // TypeOffset is relative to the beginning of the header,
+ // so we have to account for the leading length field.
+ // FIXME: The size of the length field is 12 in DWARF64.
+ unsigned SizeOfLength = 4;
+ return TypeOffset < getLength() + SizeOfLength;
}
void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index f50487fc3ba3..3835d4da9ae9 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -349,18 +349,18 @@ void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
if (Die.isSubroutineDIE()) {
for (const auto &R : Die.getAddressRanges()) {
// Ignore 0-sized ranges.
- if (R.first == R.second)
+ if (R.LowPC == R.HighPC)
continue;
- auto B = AddrDieMap.upper_bound(R.first);
- if (B != AddrDieMap.begin() && R.first < (--B)->second.first) {
+ auto B = AddrDieMap.upper_bound(R.LowPC);
+ if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
// The range is a sub-range of existing ranges, we need to split the
// existing range.
- if (R.second < B->second.first)
- AddrDieMap[R.second] = B->second;
- if (R.first > B->first)
- AddrDieMap[B->first].first = R.first;
+ if (R.HighPC < B->second.first)
+ AddrDieMap[R.HighPC] = B->second;
+ if (R.LowPC > B->first)
+ AddrDieMap[B->first].first = R.LowPC;
}
- AddrDieMap[R.first] = std::make_pair(R.second, Die);
+ AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
}
// Parent DIEs are added to the AddrDieMap prior to the Children DIEs to
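The updateAddressDieMap change above is an interval-insertion routine over a
std::map keyed by start address: a child sub-range splits the enclosing parent
range so lookups inside the child resolve to the child. A reduced sketch with
an int payload standing in for the (HighPC, DIE) pair's DIE (assumes
well-formed ranges, Lo < Hi):

#include <cstdint>
#include <map>
#include <utility>

// LowPC -> (HighPC, payload)
using IntervalMap = std::map<uint64_t, std::pair<uint64_t, int>>;

void insertSubRange(IntervalMap &M, uint64_t Lo, uint64_t Hi, int Payload) {
  auto B = M.upper_bound(Lo);
  if (B != M.begin() && Lo < (--B)->second.first) {
    // [Lo, Hi) nests inside *B: keep the parent's tail beyond Hi...
    if (Hi < B->second.first)
      M[Hi] = B->second;
    // ...and truncate the parent's head so it ends where the child begins.
    if (Lo > B->first)
      M[B->first].first = Lo;
  }
  M[Lo] = {Hi, Payload}; // The child now owns [Lo, Hi).
}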
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 9494e876da15..8a544296f65c 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
using namespace dwarf;
using namespace object;
-void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Attr = AttrValue.Attr;
switch (Attr) {
@@ -68,7 +68,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
}
}
-void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Form = AttrValue.Value.getForm();
switch (Form) {
@@ -136,7 +136,7 @@ void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
}
}
-void DWARFVerifier::veifyDebugInfoReferences() {
+void DWARFVerifier::verifyDebugInfoReferences() {
// Take all references and make sure they point to an actual DIE by
// getting the DIE by offset and emitting an error
OS << "Verifying .debug_info references...\n";
@@ -172,7 +172,7 @@ bool DWARFVerifier::handleDebugInfo() {
}
}
}
- veifyDebugInfoReferences();
+ verifyDebugInfoReferences();
return NumDebugInfoErrors == 0;
}
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 375c35b11145..701a318511b8 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -109,7 +109,7 @@ uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
}
uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
- return TypeIndexOffsets.size() * sizeof(TypeIndexOffset);
+ return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset);
}
Error TpiStreamBuilder::finalizeMsfLayout() {
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index a5100a56bcf1..a27573f93b97 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -94,9 +94,8 @@ class OrcMCJITReplacement : public ExecutionEngine {
return ClientMM->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+ void deregisterEHFrames() override {
+ return ClientMM->deregisterEHFrames();
}
void notifyObjectLoaded(RuntimeDyld &RTDyld,
diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index de73fbde8eb7..99e84b7496d4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
#endif
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+ size_t Size) {
+ registerEHFramesInProcess(Addr, Size);
+ EHFrames.push_back({Addr, Size});
+}
+
+void RTDyldMemoryManager::deregisterEHFrames() {
+ for (auto &Frame : EHFrames)
+ deregisterEHFramesInProcess(Frame.Addr, Frame.Size);
+ EHFrames.clear();
+}
+
static int jit_noop() {
return 0;
}
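The two functions added above centralize EH-frame bookkeeping in the memory
manager: registration appends to one list, and teardown drains it. The shape
of the pattern as a standalone sketch (the unwinder calls are stand-ins, not
real API):

#include <cstddef>
#include <cstdint>
#include <vector>

static void registerWithUnwinder(uint8_t *, size_t) {}   // stand-in
static void deregisterWithUnwinder(uint8_t *, size_t) {} // stand-in

class FrameRegistry {
  struct EHFrame { uint8_t *Addr; size_t Size; };
  std::vector<EHFrame> EHFrames; // every registered frame, in order
public:
  void registerFrame(uint8_t *Addr, size_t Size) {
    registerWithUnwinder(Addr, Size);
    EHFrames.push_back({Addr, Size}); // remember it for teardown
  }
  void deregisterAll() {
    for (const EHFrame &F : EHFrames)
      deregisterWithUnwinder(F.Addr, F.Size);
    EHFrames.clear(); // safe to call deregisterAll twice
  }
};

This is why the ELF and COFF subclasses below can drop their per-target
deregisterEHFrames overrides entirely.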
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index df9d2ceba329..e9a4b71c903d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -73,7 +73,9 @@ namespace llvm {
void RuntimeDyldImpl::registerEHFrames() {}
-void RuntimeDyldImpl::deregisterEHFrames() {}
+void RuntimeDyldImpl::deregisterEHFrames() {
+ MemMgr.deregisterEHFrames();
+}
#ifndef NDEBUG
static void dumpSectionMemory(const SectionEntry &S, StringRef State) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 50f63fb8dd39..660843765b3f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -221,22 +221,10 @@ void RuntimeDyldELF::registerEHFrames() {
uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
size_t EHFrameSize = Sections[EHFrameSID].getSize();
MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- RegisteredEHFrameSections.push_back(EHFrameSID);
}
UnregisteredEHFrameSections.clear();
}
-void RuntimeDyldELF::deregisterEHFrames() {
- for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
- SID EHFrameSID = RegisteredEHFrameSections[i];
- uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress();
- uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
- size_t EHFrameSize = Sections[EHFrameSID].getSize();
- MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- }
- RegisteredEHFrameSections.clear();
-}
-
std::unique_ptr<RuntimeDyldELF>
llvm::RuntimeDyldELF::create(Triple::ArchType Arch,
RuntimeDyld::MemoryManager &MemMgr,
@@ -802,20 +790,35 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
writeInt32BE(LocalAddress, Delta / 2);
break;
}
+ case ELF::R_390_PC16: {
+ int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
+ assert(int16_t(Delta) == Delta && "R_390_PC16 overflow");
+ writeInt16BE(LocalAddress, Delta);
+ break;
+ }
case ELF::R_390_PC32: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
assert(int32_t(Delta) == Delta && "R_390_PC32 overflow");
writeInt32BE(LocalAddress, Delta);
break;
}
- case ELF::R_390_64:
- writeInt64BE(LocalAddress, Value + Addend);
- break;
case ELF::R_390_PC64: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
writeInt64BE(LocalAddress, Delta);
break;
}
+ case ELF::R_390_8:
+ *LocalAddress = (uint8_t)(Value + Addend);
+ break;
+ case ELF::R_390_16:
+ writeInt16BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_32:
+ writeInt32BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_64:
+ writeInt64BE(LocalAddress, Value + Addend);
+ break;
}
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 84dd810101f3..fb5da6dd8bbb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -152,7 +152,6 @@ private:
// in a table until we receive a request to register all unregistered
// EH frame sections with the memory manager.
SmallVector<SID, 2> UnregisteredEHFrameSections;
- SmallVector<SID, 2> RegisteredEHFrameSections;
// Map between GOT relocation value and corresponding GOT offset
std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
@@ -180,7 +179,6 @@ public:
StubMap &Stubs) override;
bool isCompatibleFile(const object::ObjectFile &Obj) const override;
void registerEHFrames() override;
- void deregisterEHFrames() override;
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override;
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index f5cc883d98fd..18c23c5a2a5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -515,7 +515,7 @@ public:
virtual void registerEHFrames();
- virtual void deregisterEHFrames();
+ void deregisterEHFrames();
virtual Error finalizeLoad(const ObjectFile &ObjImg,
ObjSectionToIDMap &SectionMap) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 0398413e1532..6aa1a2bdb926 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -217,7 +217,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 8c6af0bd9c6d..318afa21a88b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -316,7 +316,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 109beb36f1ee..26e73989d7ed 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -194,9 +194,6 @@ public:
}
UnregisteredEHFrameSections.clear();
}
- void deregisterEHFrames() override {
- // Stub
- }
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index b85ba210afb3..e93c79cfcec6 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -656,7 +656,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
SMR.WaitClient();
size_t Size = SMR.ReadByteArraySize();
SMR.WriteByteArray(nullptr, 0);
- F->RunOne(SMR.GetByteArray(), Size);
+ const Unit tmp(SMR.GetByteArray(), SMR.GetByteArray() + Size);
+ F->RunOne(tmp.data(), tmp.size());
SMR.PostServer();
}
return 0;
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 0a1ff1b1df6a..7ff196c8fa96 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -92,10 +92,10 @@ FUZZER_FLAG_INT(print_pcs, 0, "If 1, print out newly covered PCs.")
FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.")
FUZZER_FLAG_INT(print_corpus_stats, 0,
"If 1, print statistics on corpus elements at exit.")
-FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit."
- " Experimental, only with trace-pc-guard")
-FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit."
- " Experimental, only with trace-pc-guard")
+FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information as text"
+ " at exit.")
+FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information as a"
+ " .sancov file at exit.")
FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index ad067ee2c0d9..5f184c2316e2 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -91,6 +91,7 @@ public:
private:
void AlarmCallback();
void CrashCallback();
+ void CrashOnOverwrittenData();
void InterruptCallback();
void MutateAndTestOne();
void ReportNewCoverage(InputInfo *II, const Unit &U);
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index d84c3dbdaf77..14caa203c5ef 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -422,6 +422,24 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
return CurrentUnitSize;
}
+void Fuzzer::CrashOnOverwrittenData() {
+  Printf("==%d== ERROR: libFuzzer: fuzz target overwrites its const input\n",
+ GetPid());
+ DumpCurrentUnit("crash-");
+  Printf("SUMMARY: libFuzzer: overwrites-const-input\n");
+ _Exit(Options.ErrorExitCode); // Stop right now.
+}
+
+// Compare two arrays, but not all bytes if the arrays are large.
+static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) {
+ const size_t Limit = 64;
+  if (Size <= Limit)
+ return !memcmp(A, B, Size);
+ // Compare first and last Limit/2 bytes.
+ return !memcmp(A, B, Limit / 2) &&
+ !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2);
+}
+
void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
assert(InFuzzingThread());
if (SMR.IsClient())
@@ -443,6 +461,8 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
(void)Res;
assert(Res == 0);
HasMoreMallocsThanFrees = AllocTracer.Stop();
+ if (!LooseMemeq(DataCopy, Data, Size))
+ CrashOnOverwrittenData();
CurrentUnitSize = 0;
delete[] DataCopy;
}
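LooseMemeq above trades completeness for speed: buffers up to 64 bytes are
compared in full, while for larger buffers only the first and last 32 bytes
are checked, so an overwrite confined to the middle goes unnoticed by design.
A standalone check of that behavior (function body mirrored from the diff):

#include <cassert>
#include <cstdint>
#include <cstring>

static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) {
  const size_t Limit = 64;
  if (Size <= Limit)
    return !memcmp(A, B, Size);
  return !memcmp(A, B, Limit / 2) &&
         !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2);
}

int main() {
  uint8_t A[128] = {0}, B[128] = {0};
  B[64] = 1;                      // a middle byte differs...
  assert(LooseMemeq(A, B, 128));  // ...and is intentionally not detected
  B[127] = 1;                     // a byte within the last 32 differs
  assert(!LooseMemeq(A, B, 128)); // detected
  return 0;
}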
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index cd846c7deec5..e60d4130de10 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -217,11 +217,12 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
size_t NumPositions = 0;
for (const uint8_t *Cur = Data;
Cur < End && NumPositions < kMaxNumPositions; Cur++) {
- Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
+ Cur =
+ (const uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
if (!Cur) break;
Positions[NumPositions++] = Cur - Data;
}
- if (!NumPositions) break;
+ if (!NumPositions) continue;
return DictionaryEntry(W, Positions[Rand(NumPositions)]);
}
DictionaryEntry DE(W);
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
index b3a54e57fceb..3815ed11cf60 100644
--- a/lib/Fuzzer/afl/afl_driver.cpp
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -59,6 +59,11 @@ statistics from the file. If that fails then the process will quit.
#include <signal.h>
#include <sys/resource.h>
#include <sys/time.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
// Platform detection. Copied from FuzzerInternal.h
#ifdef __linux__
#define LIBFUZZER_LINUX 1
@@ -245,17 +250,39 @@ extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
return 0;
}
+// Execute any files provided as parameters.
+int ExecuteFilesOneByOne(int argc, char **argv) {
+ for (int i = 1; i < argc; i++) {
+ std::ifstream in(argv[i]);
+ in.seekg(0, in.end);
+ size_t length = in.tellg();
+    in.seekg(0, in.beg);
+ std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl;
+ // Allocate exactly length bytes so that we reliably catch buffer overflows.
+ std::vector<char> bytes(length);
+ in.read(bytes.data(), bytes.size());
+ assert(in);
+ LLVMFuzzerTestOneInput(reinterpret_cast<const uint8_t *>(bytes.data()),
+ bytes.size());
+    std::cout << "Execution successful" << std::endl;
+ }
+ return 0;
+}
+
int main(int argc, char **argv) {
- fprintf(stderr, "======================= INFO =========================\n"
- "This binary is built for AFL-fuzz.\n"
- "To run the target function on a single input execute this:\n"
- " %s < INPUT_FILE\n"
- "To run the fuzzing execute this:\n"
- " afl-fuzz [afl-flags] %s [N] "
- "-- run N fuzzing iterations before "
- "re-spawning the process (default: 1000)\n"
- "======================================================\n",
- argv[0], argv[0]);
+ fprintf(stderr,
+ "======================= INFO =========================\n"
+ "This binary is built for AFL-fuzz.\n"
+ "To run the target function on individual input(s) execute this:\n"
+ " %s < INPUT_FILE\n"
+ "or\n"
+ " %s INPUT_FILE1 [INPUT_FILE2 ... ]\n"
+ "To fuzz with afl-fuzz execute this:\n"
+ " afl-fuzz [afl-flags] %s [-N]\n"
+ "afl-fuzz will run N iterations before "
+ "re-spawning the process (default: 1000)\n"
+ "======================================================\n",
+ argv[0], argv[0], argv[0]);
if (LLVMFuzzerInitialize)
LLVMFuzzerInitialize(&argc, &argv);
// Do any other expensive one-time initialization here.
@@ -266,8 +293,14 @@ int main(int argc, char **argv) {
__afl_manual_init();
int N = 1000;
- if (argc >= 2)
- N = atoi(argv[1]);
+ if (argc == 2 && argv[1][0] == '-')
+ N = atoi(argv[1] + 1);
+  else if (argc == 2 && (N = atoi(argv[1])) > 0)
+ fprintf(stderr, "WARNING: using the deprecated call style `%s %d`\n",
+ argv[0], N);
+ else if (argc > 1)
+    return ExecuteFilesOneByOne(argc, argv);
+
assert(N > 0);
time_t unit_time_secs;
int num_runs = 0;
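The new argument handling above distinguishes three spellings. A condensed
standalone sketch of the dispatch (hypothetical driver, same logic):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  int N = 1000; // default persistent-mode iteration count
  if (argc == 2 && argv[1][0] == '-')
    N = atoi(argv[1] + 1);  // "prog -500": run 500 iterations
  else if (argc == 2 && (N = atoi(argv[1])) > 0)
    fprintf(stderr, "deprecated: bare count, use -%d\n", N); // "prog 500"
  else if (argc > 1)
    return 0;               // "prog f1 f2": treat arguments as input files
  fprintf(stderr, "running %d iterations\n", N);
  return 0;
}

Note the precedence: a leading dash always wins, so an input file whose name
starts with '-' cannot be passed positionally.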
diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp
index 3dd0b6117305..e3f5f7100883 100644
--- a/lib/Fuzzer/test/AFLDriverTest.cpp
+++ b/lib/Fuzzer/test/AFLDriverTest.cpp
@@ -4,19 +4,25 @@
// Contains dummy functions used to avoid dependency on AFL.
#include <stdint.h>
#include <stdlib.h>
+#include <stdio.h>
extern "C" void __afl_manual_init() {}
-extern "C" int __afl_persistent_loop(unsigned int) {
+extern "C" int __afl_persistent_loop(unsigned int N) {
+ static int Count = N;
+  fprintf(stderr, "__afl_persistent_loop called, Count = %d\n", Count);
+ if (Count--) return 1;
return 0;
}
// This declaration exists to prevent the Darwin linker
// from complaining about this being a missing weak symbol.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ fprintf(stderr, "LLVMFuzzerInitialize called\n");
return 0;
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  fprintf(stderr, "LLVMFuzzerTestOneInput called; Size = %zu\n", Size);
return 0;
}
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index cd049d3f03d8..b39938a705f6 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -104,6 +104,7 @@ set(Tests
OneHugeAllocTest
OutOfMemoryTest
OutOfMemorySingleLargeMallocTest
+ OverwriteInputTest
RepeatedMemcmp
RepeatedBytesTest
SimpleCmpTest
diff --git a/lib/Fuzzer/test/OverwriteInputTest.cpp b/lib/Fuzzer/test/OverwriteInputTest.cpp
new file mode 100644
index 000000000000..e688682346a6
--- /dev/null
+++ b/lib/Fuzzer/test/OverwriteInputTest.cpp
@@ -0,0 +1,13 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. Make sure we abort if Data is overwritten.
+#include <cstdint>
+#include <cstddef>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size)
+ *const_cast<uint8_t*>(Data) = 1;
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/afl-driver.test b/lib/Fuzzer/test/afl-driver.test
new file mode 100644
index 000000000000..6eab23cc3636
--- /dev/null
+++ b/lib/Fuzzer/test/afl-driver.test
@@ -0,0 +1,26 @@
+REQUIRES: linux
+RUN: echo -n "abc" > %t.file3
+RUN: echo -n "abcd" > %t.file4
+
+RUN: AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK1
+CHECK1: __afl_persistent_loop called, Count = 1000
+CHECK1: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 -42 2>&1 | FileCheck %s --check-prefix=CHECK2
+CHECK2: __afl_persistent_loop called, Count = 42
+CHECK2: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 666 2>&1 | FileCheck %s --check-prefix=CHECK3
+CHECK3: WARNING: using the deprecated call style
+CHECK3: __afl_persistent_loop called, Count = 666
+CHECK3: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK4
+CHECK4: LLVMFuzzerTestOneInput called; Size = 3
+
+RUN: AFLDriverTest %t.file3 %t.file4 2>&1 | FileCheck %s --check-prefix=CHECK5
+CHECK5: LLVMFuzzerTestOneInput called; Size = 3
+CHECK5: LLVMFuzzerTestOneInput called; Size = 4
diff --git a/lib/Fuzzer/test/overwrite-input.test b/lib/Fuzzer/test/overwrite-input.test
new file mode 100644
index 000000000000..81c27909e8df
--- /dev/null
+++ b/lib/Fuzzer/test/overwrite-input.test
@@ -0,0 +1,2 @@
+RUN: not LLVMFuzzer-OverwriteInputTest 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: fuzz target overwrites its const input
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 4c6e3e3788bd..ec4663018bd4 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -805,6 +805,9 @@ void SlotTracker::processModule() {
if (!Var.hasName())
CreateModuleSlot(&Var);
processGlobalObjectMetadata(Var);
+ auto Attrs = Var.getAttributes();
+ if (Attrs.hasAttributes())
+ CreateAttributeSetSlot(Attrs);
}
for (const GlobalAlias &A : TheModule->aliases()) {
@@ -2502,6 +2505,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
GV->getAllMetadata(MDs);
printMetadataAttachments(MDs, ", ");
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttributes())
+ Out << " #" << Machine.getAttributeGroupSlot(Attrs);
+
printInfoComment(*GV);
}
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index cf2925254695..acfac316e91e 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -1,4 +1,4 @@
-//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===//
+//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,9 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
#include <cassert>
-#include <climits>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -80,11 +78,13 @@ public:
else
Profile(ID, getKindAsString(), getValueAsString());
}
+
static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
uint64_t Val) {
ID.AddInteger(Kind);
if (Val) ID.AddInteger(Val);
}
+
static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) {
ID.AddString(Kind);
if (!Values.empty()) ID.AddString(Values);
@@ -114,9 +114,10 @@ public:
};
class IntAttributeImpl : public EnumAttributeImpl {
- void anchor() override;
uint64_t Val;
+ void anchor() override;
+
public:
IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
@@ -188,20 +189,22 @@ public:
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
std::string getAsString(bool InAttrGrp) const;
- typedef const Attribute *iterator;
+ using iterator = const Attribute *;
+
iterator begin() const { return getTrailingObjects<Attribute>(); }
iterator end() const { return begin() + NumAttrs; }
void Profile(FoldingSetNodeID &ID) const {
Profile(ID, makeArrayRef(begin(), end()));
}
+
static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
for (const auto &Attr : AttrList)
Attr.Profile(ID);
}
};
-typedef std::pair<unsigned, AttributeSet> IndexAttrPair;
+using IndexAttrPair = std::pair<unsigned, AttributeSet>;
//===----------------------------------------------------------------------===//
/// \class
@@ -265,7 +268,8 @@ public:
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
}
- typedef AttributeSet::iterator iterator;
+ using iterator = AttributeSet::iterator;
+
iterator begin(unsigned Slot) const {
return getSlotAttributes(Slot).begin();
}
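The paired Profile methods above (one member, one static per key shape) exist
so attribute nodes can be uniqued in a FoldingSet: the static forms build a
lookup key before any node exists, and the member form rebuilds it from a live
node. A minimal sketch of that uniquing discipline with a toy node type (the
pattern, not the AttributeImpl internals):

#include "llvm/ADT/FoldingSet.h"
#include <cstdint>
using namespace llvm;

struct AlignNode : FoldingSetNode {
  uint64_t Align;
  explicit AlignNode(uint64_t A) : Align(A) {}
  void Profile(FoldingSetNodeID &ID) const { ID.AddInteger(Align); }
};

static FoldingSet<AlignNode> Interned;

AlignNode *getOrCreate(uint64_t Align) {
  FoldingSetNodeID ID;
  ID.AddInteger(Align); // must mirror AlignNode::Profile exactly
  void *InsertPos;
  if (AlignNode *N = Interned.FindNodeOrInsertPos(ID, InsertPos))
    return N;           // same key -> same interned node
  AlignNode *N = new AlignNode(Align);
  Interned.InsertNode(N, InsertPos);
  return N;
}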
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 3b1140ab542c..ce60367a6c8b 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -34,6 +34,8 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
+#include <climits>
+#include <cstddef>
#include <cstdint>
#include <limits>
#include <map>
@@ -504,16 +506,74 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
return AttributeSet(AttributeSetNode::get(C, Attrs));
}
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
+ B.addAttribute(Kind, Value);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttributes(LLVMContext &C,
+ const AttributeSet AS) const {
+ if (!hasAttributes())
+ return AS;
+
+ if (!AS.hasAttributes())
+ return *this;
+
+ AttrBuilder B(AS);
+ for (Attribute I : *this)
+ B.addAttribute(I);
+
+ return get(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
+ const AttrBuilder &Attrs) const {
+
+ // FIXME it is not obvious how this should work for alignment.
+ // For now, say we can't pass in alignment, which no current use does.
+ assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
+
+ AttrBuilder B(*this);
+ B.remove(Attrs);
+ return get(C, B);
+}
+
unsigned AttributeSet::getNumAttributes() const {
return SetNode ? SetNode->getNumAttributes() : 0;
}
bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
bool AttributeSet::hasAttribute(StringRef Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
@@ -557,6 +617,14 @@ AttributeSet::iterator AttributeSet::end() const {
return SetNode ? SetNode->end() : nullptr;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeSet::dump() const {
+ dbgs() << "AS =\n";
+ dbgs() << " { ";
+ dbgs() << getAsString(true) << " }\n";
+}
+#endif
+
//===----------------------------------------------------------------------===//
// AttributeSetNode Definition
//===----------------------------------------------------------------------===//
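The AttributeSet methods added above form a pure value-semantics interface:
each add/remove returns a fresh set and never mutates the receiver. A brief
usage sketch (assumes an LLVMContext named Ctx is in scope; the string key is
hypothetical):

AttributeSet AS; // empty
AS = AS.addAttribute(Ctx, Attribute::NoUnwind);    // enum attribute
AS = AS.addAttribute(Ctx, "my-string-attr", "on"); // string attribute
assert(AS.hasAttribute(Attribute::NoUnwind));

AttributeSet Smaller = AS.removeAttribute(Ctx, Attribute::NoUnwind);
assert(!Smaller.hasAttribute(Attribute::NoUnwind));
assert(AS.hasAttribute(Attribute::NoUnwind)); // the original is untouched

Per the FIXME in removeAttributes, alignment cannot be removed this way: the
assert there fires if an AttrBuilder carrying an alignment is passed in.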
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 80b117015ede..a20f3f811c8d 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -2041,9 +2041,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
Optional<unsigned> InRangeIndex,
ArrayRef<Value *> Idxs) {
if (Idxs.empty()) return C;
- Constant *Idx0 = cast<Constant>(Idxs[0]);
- if ((Idxs.size() == 1 && Idx0->isNullValue()))
- return C;
if (isa<UndefValue>(C)) {
Type *GEPTy = GetElementPtrInst::getGEPReturnType(
@@ -2051,10 +2048,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
return UndefValue::get(GEPTy);
}
+ Constant *Idx0 = cast<Constant>(Idxs[0]);
+ if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
+ return C;
+
if (C->isNullValue()) {
bool isNull = true;
for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
- if (!cast<Constant>(Idxs[i])->isNullValue()) {
+ if (!isa<UndefValue>(Idxs[i]) &&
+ !cast<Constant>(Idxs[i])->isNullValue()) {
isNull = false;
break;
}
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index aeb1257754f3..509caba3acd4 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -278,7 +278,7 @@ APInt ConstantRange::getUnsignedMax() const {
}
APInt ConstantRange::getUnsignedMin() const {
- if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+ if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue()))
return APInt::getMinValue(getBitWidth());
return getLower();
}
@@ -442,7 +442,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper;
- if (L == 0 && U == 0)
+ if (L.isNullValue() && U.isNullValue())
return ConstantRange(getBitWidth());
return ConstantRange(std::move(L), std::move(U));
@@ -757,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const {
// from one positive number to another which is as good as we can generate.
// In this case, skip the extra work of generating signed ranges which aren't
// going to be better than this range.
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative())
+ if (!UR.isWrappedSet() &&
+ (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue()))
return UR;
// Now the signed range. Because we could be dealing with negative numbers
@@ -834,7 +835,7 @@ ConstantRange::umin(const ConstantRange &Other) const {
ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
if (RHS.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
@@ -842,7 +843,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
APInt RHS_umin = RHS.getUnsignedMin();
- if (RHS_umin == 0) {
+ if (RHS_umin.isNullValue()) {
// We want the lowest value in RHS excluding zero. Usually that would be 1
// except for a range in the form of [X, 1) in which case it would be X.
if (RHS.getUpper() == 1)
@@ -892,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
- APInt min = getUnsignedMin().shl(Other.getUnsignedMin());
- APInt max = getUnsignedMax().shl(Other.getUnsignedMax());
+ APInt max = getUnsignedMax();
+ APInt Other_umax = Other.getUnsignedMax();
- // there's no overflow!
- APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros());
- if (Zeros.ugt(Other.getUnsignedMax()))
- return ConstantRange(std::move(min), std::move(max) + 1);
+  // The largest shift may push set bits off the top; be conservative.
+ if (Other_umax.uge(max.countLeadingZeros()))
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
// FIXME: implement the other tricky cases
- return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+ APInt min = getUnsignedMin();
+ min <<= Other.getUnsignedMin();
+ max <<= Other_umax;
+
+ return ConstantRange(std::move(min), std::move(max) + 1);
}
ConstantRange
ConstantRange::lshr(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
-
- APInt max = getUnsignedMax().lshr(Other.getUnsignedMin());
+
+ APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1;
APInt min = getUnsignedMin().lshr(Other.getUnsignedMax());
- if (min == max + 1)
+ if (min == max)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(std::move(min), std::move(max) + 1);
+ return ConstantRange(std::move(min), std::move(max));
}
ConstantRange ConstantRange::inverse() const {
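With the corrected shl above, any shift amount large enough to push set bits
past the top yields the conservative full set rather than a bogus tight range.
Two worked i8 instances (library API as in this file):

ConstantRange X(APInt(8, 128), APInt(8, 129)); // exactly {128} = 0b10000000
ConstantRange S(APInt(8, 1), APInt(8, 2));     // shift amounts {1}
assert(X.shl(S).isFullSet());                  // 128 << 1 overflows i8

ConstantRange Y(APInt(8, 3), APInt(8, 4));     // {3}
ConstantRange T(APInt(8, 2), APInt(8, 3));     // {2}
assert(Y.shl(T) == ConstantRange(APInt(8, 12), APInt(8, 13))); // precise {12}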
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index ffc8f2e4303b..4b9d89cda539 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
-#include <cstdarg>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -966,16 +966,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-Constant *ConstantStruct::get(StructType *T, ...) {
- va_list ap;
- SmallVector<Constant*, 8> Values;
- va_start(ap, T);
- while (Constant *Val = va_arg(ap, llvm::Constant*))
- Values.push_back(Val);
- va_end(ap);
- return get(T, Values);
-}
-
ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
: ConstantAggregate(T, ConstantVectorVal, V) {
assert(V.size() == T->getNumElements() &&
@@ -1810,8 +1800,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) {
Constant *ConstantExpr::getAlignOf(Type* Ty) {
// alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
- Type *AligningTy =
- StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr);
+ Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty);
Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index eda751d8af4a..25eb9452d9d0 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
@@ -387,31 +388,34 @@ struct ConstantExprKeyType;
template <class ConstantClass> struct ConstantInfo;
template <> struct ConstantInfo<ConstantExpr> {
- typedef ConstantExprKeyType ValType;
- typedef Type TypeClass;
+ using ValType = ConstantExprKeyType;
+ using TypeClass = Type;
};
template <> struct ConstantInfo<InlineAsm> {
- typedef InlineAsmKeyType ValType;
- typedef PointerType TypeClass;
+ using ValType = InlineAsmKeyType;
+ using TypeClass = PointerType;
};
template <> struct ConstantInfo<ConstantArray> {
- typedef ConstantAggrKeyType<ConstantArray> ValType;
- typedef ArrayType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantArray>;
+ using TypeClass = ArrayType;
};
template <> struct ConstantInfo<ConstantStruct> {
- typedef ConstantAggrKeyType<ConstantStruct> ValType;
- typedef StructType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantStruct>;
+ using TypeClass = StructType;
};
template <> struct ConstantInfo<ConstantVector> {
- typedef ConstantAggrKeyType<ConstantVector> ValType;
- typedef VectorType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantVector>;
+ using TypeClass = VectorType;
};
template <class ConstantClass> struct ConstantAggrKeyType {
ArrayRef<Constant *> Operands;
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands, const ConstantClass *)
: Operands(Operands) {}
+
ConstantAggrKeyType(const ConstantClass *C,
SmallVectorImpl<Constant *> &Storage) {
assert(Storage.empty() && "Expected empty storage");
@@ -437,7 +441,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
return hash_combine_range(Operands.begin(), Operands.end());
}
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+
ConstantClass *create(TypeClass *Ty) const {
return new (Operands.size()) ConstantClass(Ty, Operands);
}
@@ -457,6 +462,7 @@ struct InlineAsmKeyType {
: AsmString(AsmString), Constraints(Constraints), FTy(FTy),
HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack),
AsmDialect(AsmDialect) {}
+
InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &)
: AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()),
FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()),
@@ -483,7 +489,8 @@ struct InlineAsmKeyType {
AsmDialect, FTy);
}
- typedef ConstantInfo<InlineAsm>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<InlineAsm>::TypeClass;
+
InlineAsm *create(TypeClass *Ty) const {
assert(PointerType::getUnqual(FTy) == Ty);
return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects,
@@ -507,11 +514,13 @@ struct ConstantExprKeyType {
: Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
SubclassData(SubclassData), Ops(Ops), Indexes(Indexes),
ExplicitTy(ExplicitTy) {}
+
ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {}
+
ConstantExprKeyType(const ConstantExpr *CE,
SmallVectorImpl<Constant *> &Storage)
: Opcode(CE->getOpcode()),
@@ -553,7 +562,8 @@ struct ConstantExprKeyType {
hash_combine_range(Indexes.begin(), Indexes.end()));
}
- typedef ConstantInfo<ConstantExpr>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<ConstantExpr>::TypeClass;
+
ConstantExpr *create(TypeClass *Ty) const {
switch (Opcode) {
default:
@@ -594,16 +604,17 @@ struct ConstantExprKeyType {
template <class ConstantClass> class ConstantUniqueMap {
public:
- typedef typename ConstantInfo<ConstantClass>::ValType ValType;
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
- typedef std::pair<TypeClass *, ValType> LookupKey;
+ using ValType = typename ConstantInfo<ConstantClass>::ValType;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+ using LookupKey = std::pair<TypeClass *, ValType>;
/// Key and hash together, so that we compute the hash only once and reuse it.
- typedef std::pair<unsigned, LookupKey> LookupKeyHashed;
+ using LookupKeyHashed = std::pair<unsigned, LookupKey>;
private:
struct MapInfo {
- typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
+ using ConstantClassInfo = DenseMapInfo<ConstantClass *>;
+
static inline ConstantClass *getEmptyKey() {
return ConstantClassInfo::getEmptyKey();
}
@@ -643,7 +654,7 @@ private:
};
public:
- typedef DenseSet<ConstantClass *, MapInfo> MapTy;
+ using MapTy = DenseSet<ConstantClass *, MapInfo>;
private:
MapTy Map;
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index cdbe237766a3..e6c49cad0722 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -672,6 +672,24 @@ void DIExpression::appendOffset(SmallVectorImpl<uint64_t> &Ops,
}
}
+bool DIExpression::extractIfOffset(int64_t &Offset) const {
+ if (getNumElements() == 0) {
+ Offset = 0;
+ return true;
+ }
+ if (getNumElements() != 2)
+ return false;
+ if (Elements[0] == dwarf::DW_OP_plus) {
+ Offset = Elements[1];
+ return true;
+ }
+ if (Elements[0] == dwarf::DW_OP_minus) {
+ Offset = -Elements[1];
+ return true;
+ }
+ return false;
+}
+
DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
int64_t Offset, bool StackValue) {
SmallVector<uint64_t, 8> Ops;
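extractIfOffset above accepts exactly three expression shapes and rejects
everything else, letting callers take a fast path for pure constant offsets.
The contract, sketched (Expr and both consumers are hypothetical):

// {}                -> true,  Offset = 0
// {DW_OP_plus,  8}  -> true,  Offset = +8
// {DW_OP_minus, 8}  -> true,  Offset = -8
// anything else     -> false, Offset left untouched
int64_t Offset;
if (Expr->extractIfOffset(Offset))
  applyConstantOffset(Offset); // hypothetical fast path
else
  emitFullExpression(Expr);    // hypothetical general path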
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index f31074a7ad44..3168ec6944a3 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "LLVMContextImpl.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;
@@ -66,6 +67,119 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
+DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &Cache,
+ bool ReplaceLast) {
+ SmallVector<DILocation *, 3> InlinedAtLocations;
+ DILocation *Last = InlinedAt;
+ DILocation *CurInlinedAt = DL;
+
+ // Gather all the inlined-at nodes.
+ while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
+ // Skip any we've already built nodes for.
+ if (auto *Found = Cache[IA]) {
+ Last = cast<DILocation>(Found);
+ break;
+ }
+
+ if (ReplaceLast && !IA->getInlinedAt())
+ break;
+ InlinedAtLocations.push_back(IA);
+ CurInlinedAt = IA;
+ }
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DILocation *MD : reverse(InlinedAtLocations))
+ Cache[MD] = Last = DILocation::getDistinct(
+ Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
+
+ return Last;
+}
+
+/// Reparent \c Scope from \c OrigSP to \c NewSP.
+static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope,
+ DISubprogram *OrigSP, DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ SmallVector<DIScope *, 3> ScopeChain;
+ DIScope *Last = NewSP;
+ DIScope *CurScope = Scope;
+ do {
+ if (auto *SP = dyn_cast<DISubprogram>(CurScope)) {
+ // Don't rewrite this scope chain if it doesn't lead to the replaced SP.
+ if (SP != OrigSP)
+ return Scope;
+ Cache.insert({OrigSP, NewSP});
+ break;
+ }
+ if (auto *Found = Cache[CurScope]) {
+ Last = cast<DIScope>(Found);
+ break;
+ }
+ ScopeChain.push_back(CurScope);
+ } while ((CurScope = CurScope->getScope().resolve()));
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DIScope *MD : reverse(ScopeChain)) {
+ if (auto *LB = dyn_cast<DILexicalBlock>(MD))
+ Cache[MD] = Last = DILexicalBlock::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn());
+ else if (auto *LB = dyn_cast<DILexicalBlockFile>(MD))
+ Cache[MD] = Last = DILexicalBlockFile::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getDiscriminator());
+ else
+ llvm_unreachable("illegal parent scope");
+ }
+ return Last;
+}
+
+void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
+ DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ auto DL = I.getDebugLoc();
+ if (!OrigSP || !NewSP || OrigSP == NewSP || !DL)
+ return;
+
+ // Reparent the debug location.
+ auto &Ctx = I.getContext();
+ DILocation *InlinedAt = DL->getInlinedAt();
+ if (InlinedAt) {
+ while (auto *IA = InlinedAt->getInlinedAt())
+ InlinedAt = IA;
+ auto NewScope =
+ reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache);
+ InlinedAt =
+ DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope);
+ }
+ I.setDebugLoc(
+ DebugLoc::get(DL.getLine(), DL.getCol(),
+ reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache),
+ DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache,
+ ReplaceLastInlinedAt)));
+
+ // Fix up debug variables to point to NewSP.
+ auto reparentVar = [&](DILocalVariable *Var) {
+ return DILocalVariable::getDistinct(
+ Ctx,
+ cast<DILocalScope>(
+ reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)),
+ Var->getName(), Var->getFile(), Var->getLine(), Var->getType(),
+ Var->getArg(), Var->getFlags(), Var->getAlignInBits());
+ };
+ if (auto *DbgValue = dyn_cast<DbgValueInst>(&I)) {
+ auto *Var = DbgValue->getVariable();
+ I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ } else if (auto *DbgDeclare = dyn_cast<DbgDeclareInst>(&I)) {
+ auto *Var = DbgDeclare->getVariable();
+ I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ }
+}
+
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
if (!Loc)
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 395b6158e0c8..e73f53f3202d 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -12,20 +12,31 @@
// Diagnostics reporting is still done as part of the LLVMContext.
//===----------------------------------------------------------------------===//
-#include "llvm/IR/DiagnosticInfo.h"
-#include "LLVMContextImpl.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
#include <atomic>
+#include <cassert>
+#include <memory>
#include <string>
using namespace llvm;
@@ -53,6 +64,8 @@ struct PassRemarksOpt {
}
};
+} // end anonymous namespace
+
static PassRemarksOpt PassRemarksOptLoc;
static PassRemarksOpt PassRemarksMissedOptLoc;
static PassRemarksOpt PassRemarksAnalysisOptLoc;
@@ -85,7 +98,6 @@ PassRemarksAnalysis(
"the given regular expression"),
cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired,
cl::ZeroOrMore);
-}
int llvm::getNextAvailablePluginDiagnosticKind() {
static std::atomic<int> PluginKindID(DK_FirstPluginKind);
@@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = "";
DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
const Twine &MsgStr,
DiagnosticSeverity Severity)
- : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr),
- Instr(&I) {
+ : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) {
if (const MDNode *SrcLoc = I.getMetadata("srcloc")) {
if (SrcLoc->getNumOperands() != 0)
if (const auto *CI =
@@ -193,7 +204,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
// Only include names that correspond to user variables. FIXME: we should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
- Val = GlobalValue::getRealLinkageName(V->getName());
+ Val = GlobalValue::dropLLVMManglingEscape(V->getName());
else if (isa<Constant>(V)) {
raw_string_ostream OS(Val);
V->printAsOperand(OS, /*PrintType=*/false);
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 58c060550322..16a9e51b8306 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -1,4 +1,4 @@
-//===-- Function.cpp - Implement the Global object classes ----------------===//
+//===- Function.cpp - Implement the Global object classes -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,21 +11,51 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
#include "LLVMContextImpl.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
using namespace llvm;
// Explicit instantiations of SymbolTableListTraits since some of the methods
@@ -36,7 +66,7 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
// Argument Implementation
//===----------------------------------------------------------------------===//
-void Argument::anchor() { }
+void Argument::anchor() {}
Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
: Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
@@ -186,7 +216,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
OperandTraits<Function>::op_begin(this), 0, Linkage, name),
- Arguments(nullptr), NumArgs(Ty->getNumParams()) {
+ NumArgs(Ty->getNumParams()) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -386,24 +416,20 @@ void Function::clearGC() {
/// Copy all additional attributes (those not needed to create a Function) from
/// the Function Src to this one.
-void Function::copyAttributesFrom(const GlobalValue *Src) {
+void Function::copyAttributesFrom(const Function *Src) {
GlobalObject::copyAttributesFrom(Src);
- const Function *SrcF = dyn_cast<Function>(Src);
- if (!SrcF)
- return;
-
- setCallingConv(SrcF->getCallingConv());
- setAttributes(SrcF->getAttributes());
- if (SrcF->hasGC())
- setGC(SrcF->getGC());
+ setCallingConv(Src->getCallingConv());
+ setAttributes(Src->getAttributes());
+ if (Src->hasGC())
+ setGC(Src->getGC());
else
clearGC();
- if (SrcF->hasPersonalityFn())
- setPersonalityFn(SrcF->getPersonalityFn());
- if (SrcF->hasPrefixData())
- setPrefixData(SrcF->getPrefixData());
- if (SrcF->hasPrologueData())
- setPrologueData(SrcF->getPrologueData());
+ if (Src->hasPersonalityFn())
+ setPersonalityFn(Src->getPersonalityFn());
+ if (Src->hasPrefixData())
+ setPrefixData(Src->getPrefixData());
+ if (Src->hasPrologueData())
+ setPrologueData(Src->getPrologueData());
}
/// Table of string intrinsic names indexed by enum value.
@@ -486,10 +512,10 @@ void Function::recalculateIntrinsicID() {
static std::string getMangledTypeStr(Type* Ty) {
std::string Result;
if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
- Result += "p" + llvm::utostr(PTyp->getAddressSpace()) +
+ Result += "p" + utostr(PTyp->getAddressSpace()) +
getMangledTypeStr(PTyp->getElementType());
} else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
- Result += "a" + llvm::utostr(ATyp->getNumElements()) +
+ Result += "a" + utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType());
} else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
if (!STyp->isLiteral()) {
@@ -534,7 +560,6 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
-
/// IIT_Info - These are enumerators that describe the entries returned by the
/// getIntrinsicInfoTableEntries function.
///
@@ -585,9 +610,10 @@ enum IIT_Info {
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+ using namespace Intrinsic;
+
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
- using namespace Intrinsic;
switch (Info) {
case IIT_Done:
@@ -742,7 +768,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
llvm_unreachable("unhandled");
}
-
#define GET_INTRINSIC_GENERATOR_GLOBAL
#include "llvm/IR/Intrinsics.gen"
#undef GET_INTRINSIC_GENERATOR_GLOBAL
@@ -780,10 +805,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id,
DecodeIITType(NextElt, IITEntries, T);
}
-
static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
ArrayRef<Type*> Tys, LLVMContext &Context) {
using namespace Intrinsic;
+
IITDescriptor D = Infos.front();
Infos = Infos.slice(1);
@@ -855,12 +880,10 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::VecOfAnyPtrsToElt:
// Return the overloaded type (which determines the pointers address space)
return Tys[D.getOverloadArgNumber()];
- }
+ }
llvm_unreachable("unhandled");
}
-
-
FunctionType *Intrinsic::getType(LLVMContext &Context,
ID id, ArrayRef<Type*> Tys) {
SmallVector<IITDescriptor, 8> Table;
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 5f338f58d940..17d27b016cf2 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setDLLStorageClass(Src->getDLLStorageClass());
}
+void GlobalValue::removeFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->removeFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
+
+void GlobalValue::eraseFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->eraseFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
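
The two dispatchers above (and the TerminatorInst successor accessors later in this diff) follow LLVM's .def-file idiom: a single X-macro list expands into each switch, so every subclass is dispatched by its value ID without a virtual call, and a newly added subclass cannot silently miss a case. A self-contained toy of the pattern (simplified; not LLVM code — the real list lives in llvm/IR/Value.def):

    #include <cstdio>

    #define TOY_GLOBAL_VALUES(X) X(GlobalVariable) X(Function)

    struct GlobalValue {
      enum ValueID {
    #define HANDLE(NAME) NAME##Val,
        TOY_GLOBAL_VALUES(HANDLE)
    #undef HANDLE
      };
      ValueID ID;
      explicit GlobalValue(ValueID ID) : ID(ID) {}
      void removeFromParent(); // dispatched by ID below, no vtable required
    };

    struct GlobalVariable : GlobalValue {
      GlobalVariable() : GlobalValue(GlobalVariableVal) {}
      void removeFromParent() { std::puts("variable unlinked"); }
    };

    struct Function : GlobalValue {
      Function() : GlobalValue(FunctionVal) {}
      void removeFromParent() { std::puts("function unlinked"); }
    };

    void GlobalValue::removeFromParent() {
      switch (ID) {
    #define HANDLE(NAME) \
      case NAME##Val:    \
        return static_cast<NAME *>(this)->removeFromParent();
        TOY_GLOBAL_VALUES(HANDLE)
    #undef HANDLE
      }
    }

    int main() { Function F; static_cast<GlobalValue &>(F).removeFromParent(); }
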
+
unsigned GlobalValue::getAlignment() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
@@ -93,12 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) {
assert(getAlignment() == Align && "Alignment representation error!");
}
-void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalObject::copyAttributesFrom(const GlobalObject *Src) {
GlobalValue::copyAttributesFrom(Src);
- if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
- setAlignment(GV->getAlignment());
- setSection(GV->getSection());
- }
+ setAlignment(Src->getAlignment());
+ setSection(Src->getSection());
}
std::string GlobalValue::getGlobalIdentifier(StringRef Name,
@@ -233,7 +255,7 @@ bool GlobalValue::canIncreaseAlignment() const {
const GlobalObject *GlobalValue::getBaseObject() const {
if (auto *GO = dyn_cast<GlobalObject>(this))
return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(this))
+ if (auto *GA = dyn_cast<GlobalIndirectSymbol>(this))
return GA->getBaseObject();
return nullptr;
}
@@ -333,12 +355,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
/// Copy all additional attributes (those not needed to create a GlobalVariable)
/// from the GlobalVariable Src to this one.
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) {
GlobalObject::copyAttributesFrom(Src);
- if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) {
- setThreadLocalMode(SrcVar->getThreadLocalMode());
- setExternallyInitialized(SrcVar->isExternallyInitialized());
- }
+ setThreadLocalMode(Src->getThreadLocalMode());
+ setExternallyInitialized(Src->isExternallyInitialized());
+ setAttributes(Src->getAttributes());
}
void GlobalVariable::dropAllReferences() {
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index e265a823687f..3477c087967f 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
+ Value *Src) {
+ Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Src};
+ Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return createCallHelper(Decl, Ops, Builder);
+}
+
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
+ : Intrinsic::experimental_vector_reduce_umax;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
+ : Intrinsic::experimental_vector_reduce_umin;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmax, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmin, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 906a28a5c887..91b9d9232b54 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -534,6 +534,30 @@ bool Instruction::isAtomic() const {
}
}
+bool Instruction::hasAtomicLoad() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Load:
+ return true;
+ }
+}
+
+bool Instruction::hasAtomicStore() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Store:
+ return true;
+ }
+}
+
bool Instruction::mayThrow() const {
if (const CallInst *CI = dyn_cast<CallInst>(this))
return !CI->doesNotThrow();
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index a60cc375d568..5a5b9c0d06bb 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -1,4 +1,4 @@
-//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//===- Instructions.cpp - Implement the LLVM instructions -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Instructions.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -42,7 +60,42 @@ User::op_iterator CallSite::getCallee() const {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-TerminatorInst::~TerminatorInst() {
+TerminatorInst::~TerminatorInst() = default;
+
+unsigned TerminatorInst::getNumSuccessors() const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getNumSuccessorsV();
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getSuccessorV(idx);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<CLASS *>(this)->setSuccessorV(idx, B);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
}
//===----------------------------------------------------------------------===//
@@ -50,8 +103,7 @@ TerminatorInst::~TerminatorInst() {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-UnaryInstruction::~UnaryInstruction() {
-}
+UnaryInstruction::~UnaryInstruction() = default;
//===----------------------------------------------------------------------===//
// SelectInst Class
@@ -82,7 +134,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
return nullptr;
}
-
//===----------------------------------------------------------------------===//
// PHINode Class
//===----------------------------------------------------------------------===//
@@ -242,8 +293,7 @@ void LandingPadInst::addClause(Constant *Val) {
// CallInst Implementation
//===----------------------------------------------------------------------===//
-CallInst::~CallInst() {
-}
+CallInst::~CallInst() = default;
void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
@@ -541,7 +591,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
ArraySize, OpB, MallocF, Name);
}
-
/// CreateMalloc - Generate the IR for a call to malloc:
/// 1. Compute the malloc call's argument as the specified type's size,
/// possibly multiplied by the array size if the array size is not
@@ -692,9 +741,11 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned InvokeInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
@@ -821,6 +872,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
@@ -828,6 +880,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
@@ -847,8 +900,7 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
llvm_unreachable("ReturnInst has no successors!");
}
-ReturnInst::~ReturnInst() {
-}
+ReturnInst::~ReturnInst() = default;
//===----------------------------------------------------------------------===//
// ResumeInst Implementation
@@ -930,9 +982,11 @@ BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx == 0);
return getUnwindDest();
}
+
unsigned CleanupReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx == 0);
setUnwindDest(B);
@@ -973,9 +1027,11 @@ BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
return getSuccessor();
}
+
unsigned CatchReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
setSuccessor(B);
@@ -1067,9 +1123,11 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) {
BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned CatchSwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -1155,6 +1213,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
+
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
Instruction *InsertBefore)
: TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
@@ -1189,7 +1248,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-
BranchInst::BranchInst(const BranchInst &BI) :
TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
@@ -1216,14 +1274,15 @@ void BranchInst::swapSuccessors() {
BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned BranchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
-
//===----------------------------------------------------------------------===//
// AllocaInst Implementation
//===----------------------------------------------------------------------===//
@@ -1279,8 +1338,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
}
// Out of line virtual method, so the vtable, etc has a home.
-AllocaInst::~AllocaInst() {
-}
+AllocaInst::~AllocaInst() = default;
void AllocaInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
@@ -1543,8 +1601,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
Instruction *InsertBefore)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1556,8 +1613,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
BasicBlock *InsertAtEnd)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1771,14 +1827,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
setName(Name);
}
-
bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy())
return false;
return true;
}
-
//===----------------------------------------------------------------------===//
// InsertElementInst Implementation
//===----------------------------------------------------------------------===//
@@ -1825,7 +1879,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
return true;
}
-
//===----------------------------------------------------------------------===//
// ShuffleVectorInst Implementation
//===----------------------------------------------------------------------===//
@@ -1938,7 +1991,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
}
}
-
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -1951,7 +2003,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
// (other than weirdness with &*IdxBegin being invalid; see
// getelementptr's init routine for example). But there's no
// present need to support it.
- assert(Idxs.size() > 0 && "InsertValueInst must have at least one index");
+ assert(!Idxs.empty() && "InsertValueInst must have at least one index");
assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) ==
Val->getType() && "Inserted value must match indexed type!");
@@ -1980,7 +2032,7 @@ void ExtractValueInst::init(ArrayRef<unsigned> Idxs, const Twine &Name) {
// There's no fundamental reason why we require at least one index.
// But there's no present need to support it.
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index");
+ assert(!Idxs.empty() && "ExtractValueInst must have at least one index");
Indices.append(Idxs.begin(), Idxs.end());
setName(Name);
@@ -2053,7 +2105,6 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
setName(Name);
}
-
void BinaryOperator::init(BinaryOps iType) {
Value *LHS = getOperand(0), *RHS = getOperand(1);
(void)LHS; (void)RHS; // Silence warnings.
@@ -2213,7 +2264,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Op->getType(), Name, InsertAtEnd);
}
-
// isConstantAllOnes - Helper function for several functions below
static inline bool isConstantAllOnes(const Value *V) {
if (const Constant *C = dyn_cast<Constant>(V))
@@ -2279,7 +2329,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
return getNotArgument(const_cast<Value*>(BinOp));
}
-
// Exchange the two operands to this instruction. This instruction is safe to
// use on any binary instruction and does not modify the semantics of the
// instruction. If the instruction is order-dependent (e.g. SetLT), the opcode
@@ -2291,7 +2340,6 @@ bool BinaryOperator::swapOperands() {
return false;
}
-
//===----------------------------------------------------------------------===//
// FPMathOperator Class
//===----------------------------------------------------------------------===//
@@ -2305,7 +2353,6 @@ float FPMathOperator::getFPAccuracy() const {
return Accuracy->getValueAPF().convertToFloat();
}
-
//===----------------------------------------------------------------------===//
// CastInst Class
//===----------------------------------------------------------------------===//
@@ -2567,13 +2614,12 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0;
}
- case 12: {
+ case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
// addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
return Instruction::AddrSpaceCast;
return Instruction::BitCast;
- }
case 13:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -2594,7 +2640,6 @@ unsigned CastInst::isEliminableCastPair(
DstTy->getScalarType()->getPointerElementType())
return Instruction::AddrSpaceCast;
return 0;
-
case 15:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -3070,7 +3115,6 @@ CastInst::getCastOpcode(
/// of the types involved.
bool
CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
-
// Check for type sanity on the arguments
Type *SrcTy = S->getType();
@@ -3419,7 +3463,6 @@ bool CmpInst::isEquality() const {
return cast<FCmpInst>(this)->isEquality();
}
-
CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown cmp predicate!");
@@ -3743,9 +3786,11 @@ void SwitchInst::growOperands() {
BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned SwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -3832,9 +3877,11 @@ void IndirectBrInst::removeDestination(unsigned idx) {
BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned IndirectBrInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 628a67bd639c..b2b12289f871 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
@@ -465,6 +466,11 @@ public:
// null. It may be called multiple times.
static void createTheTimeInfo();
+ // print - Prints out timing information and then resets the timers.
+ void print() {
+ TG.print(*CreateInfoOutputFile());
+ }
+
/// getPassTimer - Return the timer for the specified pass if it exists.
Timer *getPassTimer(Pass *P) {
if (P->getAsPMDataManager())
@@ -1752,6 +1758,13 @@ Timer *llvm::getPassTimer(Pass *P) {
return nullptr;
}
+/// If timing is enabled, report the times collected up to now and then reset
+/// them.
+void llvm::reportAndResetTimings() {
+ if (TheTimeInfo)
+ TheTimeInfo->print();
+}
+
//===----------------------------------------------------------------------===//
// PMStack implementation
//
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index fec9df193685..12c258d95f52 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -1,4 +1,4 @@
-//===-- Module.cpp - Implement the Module class ---------------------------===//
+//===- Module.cpp - Implement the Module class ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,27 +11,46 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include <algorithm>
-#include <cstdarg>
-#include <cstdlib>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index b67b0a307861..c9f957c244f8 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -1,4 +1,4 @@
-//===-- Type.cpp - Implement the Type class -------------------------------===//
+//===- Type.cpp - Implement the Type class --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,25 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Type.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include <algorithm>
-#include <cstdarg>
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -220,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
return getInt64Ty(C)->getPointerTo(AS);
}
-
//===----------------------------------------------------------------------===//
// IntegerType Implementation
//===----------------------------------------------------------------------===//
@@ -362,7 +374,8 @@ void StructType::setName(StringRef Name) {
if (Name == getName()) return;
StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
- typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+ using EntryTy = StringMap<StructType *>::MapEntryTy;
// If this struct already had a name, remove its symbol table entry. Don't
// delete the data yet because it may be part of the new name.
@@ -419,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) {
return get(Context, None, isPacked);
}
-StructType *StructType::get(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::get(Ctx, StructFields);
- va_end(ap);
- return Ret;
-}
-
StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
StringRef Name, bool isPacked) {
StructType *ST = create(Context, Name);
@@ -462,21 +460,6 @@ StructType *StructType::create(ArrayRef<Type*> Elements) {
return create(Elements[0]->getContext(), Elements, StringRef());
}
-StructType *StructType::create(StringRef Name, Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::create(Ctx, StructFields, Name);
- va_end(ap);
- return Ret;
-}
-
bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const {
if ((getSubclassData() & SCDB_IsSized) != 0)
return true;
@@ -508,19 +491,6 @@ StringRef StructType::getName() const {
return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
}
-void StructType::setBody(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- setBody(StructFields);
- va_end(ap);
-}
-
bool StructType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
@@ -540,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
-
//===----------------------------------------------------------------------===//
// CompositeType Implementation
//===----------------------------------------------------------------------===//
@@ -589,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const {
return true;
}
-
//===----------------------------------------------------------------------===//
// ArrayType Implementation
//===----------------------------------------------------------------------===//
@@ -661,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
return Entry;
}
-
PointerType::PointerType(Type *E, unsigned AddrSpace)
: Type(E->getContext(), PointerTyID), PointeeTy(E) {
ContainedTys = &PointeeTy;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 65e124562493..3b68d6365872 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -267,6 +267,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
+ /// Keep track of which DISubprogram is attached to which function.
+ DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
/// Track all DICompileUnits visited.
SmallPtrSet<const Metadata *, 2> CUVisited;
@@ -386,7 +389,7 @@ public:
verifyCompileUnits();
verifyDeoptimizeCallingConvs();
-
+ DISubprogramAttachments.clear();
return !Broken;
}
@@ -2085,13 +2088,19 @@ void Verifier::visitFunction(const Function &F) {
switch (I.first) {
default:
break;
- case LLVMContext::MD_dbg:
+ case LLVMContext::MD_dbg: {
++NumDebugAttachments;
AssertDI(NumDebugAttachments == 1,
"function must have a single !dbg attachment", &F, I.second);
AssertDI(isa<DISubprogram>(I.second),
"function !dbg attachment must be a subprogram", &F, I.second);
+ auto *SP = cast<DISubprogram>(I.second);
+ const Function *&AttachedTo = DISubprogramAttachments[SP];
+ AssertDI(!AttachedTo || AttachedTo == &F,
+ "DISubprogram attached to more than one function", SP, &F);
+ AttachedTo = &F;
break;
+ }
case LLVMContext::MD_prof:
++NumProfAttachments;
Assert(NumProfAttachments == 1,
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 684b378c93e5..89ddd0fc1af3 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -24,7 +24,6 @@ subdirectories =
DebugInfo
Demangle
ExecutionEngine
- LibDriver
LineEditor
Linker
IR
@@ -39,6 +38,7 @@ subdirectories =
Support
TableGen
Target
+ ToolDrivers
Transforms
[component_0]
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2d2dcdec05fb..c73b6b6b15c1 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -973,7 +973,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// this value. If not, no need to preserve any ThinLTO copies.
!Res.second.IRName.empty())
GUIDPreservedSymbols.insert(GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName)));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
}
auto DeadSymbols =
@@ -993,7 +993,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (Res.second.IRName.empty())
continue;
auto GUID = GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (!DeadSymbols.count(GUID))
ExportedGUIDs.insert(GUID);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 86fba843e980..6a275560dc92 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -495,17 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
return;
HasVerifiedInput = true;
- if (LTOStripInvalidDebugInfo) {
- bool BrokenDebugInfo = false;
- if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
- report_fatal_error("Broken module found, compilation aborted!");
- if (BrokenDebugInfo) {
- emitWarning("Invalid debug info found, debug info will be stripped");
- StripDebugInfo(*MergedModule);
- }
- }
- if (verifyModule(*MergedModule, &dbgs()))
+ bool BrokenDebugInfo = false;
+ if (verifyModule(*MergedModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ emitWarning("Invalid debug info found, debug info will be stripped");
+ StripDebugInfo(*MergedModule);
+ }
}
void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -600,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
// If statistics were requested, print them out after codegen.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
finishOptimizationRemarks();
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index b4ee7c2b2fbc..65a7994325bc 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -446,7 +446,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
{
raw_svector_ostream OS(OutputBuffer);
ProfileSummaryInfo PSI(TheModule);
- auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr);
+ auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI);
WriteBitcodeToFile(&TheModule, OS, true, &Index);
}
return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
@@ -1024,4 +1024,5 @@ void ThinLTOCodeGenerator::run() {
// If statistics were requested, print them out now.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
}
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 15a46a2d0420..ecef1efda1a2 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -602,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
/*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
SGVar->getType()->getAddressSpace());
NewDGV->setAlignment(SGVar->getAlignment());
+ NewDGV->copyAttributesFrom(SGVar);
return NewDGV;
}
@@ -610,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
Function *IRLinker::copyFunctionProto(const Function *SF) {
// If there is no linkage to be performed or we are linking from the source,
// bring SF over.
- return Function::Create(TypeMap.get(SF->getFunctionType()),
- GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ auto *F =
+ Function::Create(TypeMap.get(SF->getFunctionType()),
+ GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ F->copyAttributesFrom(SF);
+ return F;
}
/// Set up prototypes for any aliases that come over from the source module.
@@ -619,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
auto *Ty = TypeMap.get(SGA->getValueType());
- return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
- GlobalValue::ExternalLinkage, SGA->getName(),
- &DstM);
+ auto *GA =
+ GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+ GlobalValue::ExternalLinkage, SGA->getName(), &DstM);
+ GA->copyAttributesFrom(SGA);
+ return GA;
}
GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
@@ -648,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
else if (SGV->hasExternalWeakLinkage())
NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
- NewGV->copyAttributesFrom(SGV);
-
if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
// Metadata for global variables and function declarations is copied eagerly.
if (isa<GlobalVariable>(SGV) || SGV->isDeclaration())
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index f7f2253256eb..174397e27396 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
// Avoid fixups when possible.
int64_t AbsValue;
if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
+ getContext().reportError(
+ Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
+ return;
+ }
EmitIntValue(AbsValue, Size);
return;
}
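
With this check, an absolute value that does not fit in the directive's size is diagnosed instead of being silently truncated. As a hypothetical input (not from the patch's tests): ".byte 0x1ff" evaluates to 511, which is representable neither as an unsigned nor as a signed 8-bit value, so the streamer now reports "value evaluated as 511 is out of range."
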
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 66ba853da2fe..3b213ef4ce09 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -288,6 +288,7 @@ public:
private:
bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc);
+ void altMacroString(StringRef AltMacroStr, std::string &Res);
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
@@ -1209,6 +1210,8 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
const char *CharPtr = StrLoc.getPointer();
while ((*CharPtr != '>') && (*CharPtr != '\n') &&
(*CharPtr != '\r') && (*CharPtr != '\0')){
+ if (*CharPtr == '!')
+ CharPtr++;
CharPtr++;
}
if (*CharPtr == '>') {
@@ -1218,6 +1221,15 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
return false;
}
+/// \brief Create a string without the escape characters '!'.
+void AsmParser::altMacroString(StringRef AltMacroStr, std::string &Res) {
+ for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
+ if (AltMacroStr[Pos] == '!')
+ Pos++;
+ Res += AltMacroStr[Pos];
+ }
+}
+
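
For example (an illustrative input, not from the patch): under altmacro mode an argument written as <a!>b> is lexed up to the matching unescaped '>', since isAltmacroString skips the character following each '!', and altMacroString then drops the '!' escapes, so the expanded macro body sees a>b.
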
/// \brief Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
@@ -2309,6 +2321,15 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
(*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer))
// Emit an integer value to the buffer.
OS << Token.getIntVal();
+ // Only a token that was validated as a string and begins with '<'
+ // is treated as an altmacro string.
+ else if ((Lexer.IsaAltMacroMode()) &&
+ (*(Token.getString().begin()) == '<') &&
+ Token.is(AsmToken::String)) {
+ std::string Res;
+ altMacroString(Token.getStringContents(), Res);
+ OS << Res;
+ }
// We expect no quotes around the string's contents when
// parsing for varargs.
else if (Token.isNot(AsmToken::String) || VarargParameter)
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index b1223e81be43..28531feccfe1 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -1062,7 +1062,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec,
// In COFF, a virtual section won't have any in-file
// content, so the file pointer to the content will be zero.
if (Sec->PointerToRawData == 0)
- return object_error::parse_failed;
+ return std::error_code();
// The only thing that we need to verify is that the contents is contained
// within the file bounds. We don't need to make sure it doesn't cover other
// data, as there's nothing that says that is not allowed.
@@ -1602,8 +1602,6 @@ ErrorOr<ArrayRef<UTF16>> ResourceSectionRef::getDirStringAtOffset(uint32_t Offse
uint16_t Length;
RETURN_IF_ERROR(Reader.readInteger(Length));
ArrayRef<UTF16> RawDirString;
- // Strings are stored as 2-byte aligned unicode characters but readFixedString
- // assumes byte string, so we double length.
RETURN_IF_ERROR(Reader.readArray(RawDirString, Length));
return RawDirString;
}
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 39f8704aacf2..058686e4db9e 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -168,6 +168,13 @@ static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
return Result;
}
+static wasm::WasmTable readTable(const uint8_t *&Ptr) {
+ wasm::WasmTable Table;
+ Table.ElemType = readVarint7(Ptr);
+ Table.Limits = readLimits(Ptr);
+ return Table;
+}
+
static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
const uint8_t *Start) {
// TODO(sbc): Avoid reading past EOF in the case of malformed files.
@@ -397,13 +404,22 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End)
Sections.size(), i);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
- Im.GlobalType = readVarint7(Ptr);
- Im.GlobalMutable = readVaruint1(Ptr);
+ Im.Global.Type = readVarint7(Ptr);
+ Im.Global.Mutable = readVaruint1(Ptr);
Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ Im.Memory = readLimits(Ptr);
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ Im.Table = readTable(Ptr);
+ if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ break;
default:
- // TODO(sbc): Handle other kinds of imports
return make_error<GenericBinaryError>(
"Unexpected import kind", object_error::parse_failed);
}
@@ -431,14 +447,11 @@ Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End)
uint32_t Count = readVaruint32(Ptr);
Tables.reserve(Count);
while (Count--) {
- wasm::WasmTable Table;
- Table.ElemType = readVarint7(Ptr);
- if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ Tables.push_back(readTable(Ptr));
+ if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) {
return make_error<GenericBinaryError>("Invalid table element type",
object_error::parse_failed);
}
- Table.Limits = readLimits(Ptr);
- Tables.push_back(Table);
}
if (Ptr != End)
return make_error<GenericBinaryError>("Table section ended prematurely",
@@ -493,8 +506,10 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ case wasm::WASM_EXTERNAL_TABLE:
+ break;
default:
- // TODO(sbc): Handle other kinds of exports
return make_error<GenericBinaryError>(
"Unexpected export kind", object_error::parse_failed);
}
@@ -507,7 +522,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
StartFunction = readVaruint32(Ptr);
- if (StartFunction < FunctionTypes.size())
+ if (StartFunction >= FunctionTypes.size())
return make_error<GenericBinaryError>("Invalid start function",
object_error::parse_failed);
return Error::success();
@@ -638,10 +653,14 @@ basic_symbol_iterator WasmObjectFile::symbol_end() const {
return BasicSymbolRef(Ref, this);
}
-const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const {
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const {
return Symbols[Symb.d.a];
}
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const {
+ return getWasmSymbol(Symb.getRawDataRefImpl());
+}
+
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
const WasmSymbol &Sym = getWasmSymbol(Symb);
return Sym.Name;
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index c5d1b438ee2a..910d32f16af9 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -265,8 +265,12 @@ void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
IO.mapRequired("SigIndex", Import.SigIndex);
} else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
- IO.mapRequired("GlobalType", Import.GlobalType);
- IO.mapRequired("GlobalMutable", Import.GlobalMutable);
+ IO.mapRequired("GlobalType", Import.GlobalImport.Type);
+ IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
+ IO.mapRequired("Table", Import.TableImport);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
+ IO.mapRequired("Memory", Import.Memory);
} else {
llvm_unreachable("unhandled import type");
}
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index b91b6fb7c7ad..b05efa7417b9 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -29,6 +29,7 @@
#include <algorithm>
#include <cstdint>
#include <memory>
+#include <set>
#include <system_error>
#include <utility>
#include <vector>
@@ -36,6 +37,32 @@
using namespace llvm;
using namespace sampleprof;
+std::error_code
+SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
+ if (std::error_code EC = writeHeader(ProfileMap))
+ return EC;
+
+ // Sort the ProfileMap by total samples.
+ typedef std::pair<StringRef, const FunctionSamples *> NameFunctionSamples;
+ std::vector<NameFunctionSamples> V;
+ for (const auto &I : ProfileMap)
+ V.push_back(std::make_pair(I.getKey(), &I.second));
+
+ std::stable_sort(
+ V.begin(), V.end(),
+ [](const NameFunctionSamples &A, const NameFunctionSamples &B) {
+ if (A.second->getTotalSamples() == B.second->getTotalSamples())
+ return A.first > B.first;
+ return A.second->getTotalSamples() > B.second->getTotalSamples();
+ });
+
+ for (const auto &I : V) {
+ if (std::error_code EC = write(*I.second))
+ return EC;
+ }
+ return sampleprof_error::success;
+}
+
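
The stable sort above writes the hottest functions first and breaks sample-count ties by name (descending), making the output order fully deterministic across runs. With hypothetical profiles bar:100, foo:100, baz:50, the functions are emitted as foo, bar, baz.
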
/// \brief Write samples to a text file.
///
/// Note: it may be tempting to implement this in terms of
@@ -97,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) {
}
void SampleProfileWriterBinary::addName(StringRef FName) {
- auto NextIdx = NameTable.size();
- NameTable.insert(std::make_pair(FName, NextIdx));
+ NameTable.insert(std::make_pair(FName, 0));
}
void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
@@ -136,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader(
addNames(I.second);
}
+ // Sort the names so that the NameTable is deterministic.
+ std::set<StringRef> V;
+ for (const auto &I : NameTable)
+ V.insert(I.first);
+ int i = 0;
+ for (const StringRef &N : V)
+ NameTable[N] = i++;
+
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
- for (auto N : NameTable) {
- OS << N.first;
+ for (auto N : V) {
+ OS << N;
encodeULEB128(0, OS);
}
return sampleprof_error::success;
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index caa0691f9205..17144522db82 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -122,35 +122,38 @@ APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
fromString(numbits, Str, radix);
}
+void APInt::reallocate(unsigned NewBitWidth) {
+ // If the number of words is the same we can just change the width and stop.
+ if (getNumWords() == getNumWords(NewBitWidth)) {
+ BitWidth = NewBitWidth;
+ return;
+ }
+
+ // If we have an allocation, delete it.
+ if (!isSingleWord())
+ delete [] U.pVal;
+
+ // Update BitWidth.
+ BitWidth = NewBitWidth;
+
+ // If we are supposed to have an allocation, create it.
+ if (!isSingleWord())
+ U.pVal = getMemory(getNumWords());
+}
+
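
Note that reallocate only adjusts the storage to fit NewBitWidth; it makes no attempt to preserve the old contents. Both callers account for this: AssignSlowCase copies from RHS immediately afterwards, and divide() clears all bits before reconstituting the quotient and remainder.
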
void APInt::AssignSlowCase(const APInt& RHS) {
// Don't do anything for X = X
if (this == &RHS)
return;
- if (BitWidth == RHS.getBitWidth()) {
- // assume same bit-width single-word case is already handled
- assert(!isSingleWord());
- memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
- return;
- }
+ // Adjust the bit width and handle allocations as necessary.
+ reallocate(RHS.getBitWidth());
- if (isSingleWord()) {
- // assume case where both are single words is already handled
- assert(!RHS.isSingleWord());
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- } else if (getNumWords() == RHS.getNumWords())
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- else if (RHS.isSingleWord()) {
- delete [] U.pVal;
+ // Copy the data.
+ if (isSingleWord())
U.VAL = RHS.U.VAL;
- } else {
- delete [] U.pVal;
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- }
- BitWidth = RHS.BitWidth;
- clearUnusedBits();
+ else
+ memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
}
/// This method 'profiles' an APInt for use with FoldingSet.
@@ -1138,10 +1141,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const {
return APInt(BitWidth, 0);
// The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calcuate a positive one from a negative
+ // interested in a positive inverse. Calculate a positive one from a negative
// one if necessary. A simple addition of the modulo suffices because
// abs(t[i]) is known to be less than *this/2 (see the link above).
- return t[i].isNegative() ? t[i] + modulo : t[i];
+ if (t[i].isNegative())
+ t[i] += modulo;
+
+ return std::move(t[i]);
}
/// Calculate the magic numbers required to implement a signed integer division
@@ -1240,7 +1246,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const {
/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
/// variables here have the same names as in the algorithm. Comments explain
/// the algorithm and any deviation from it.
-static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
unsigned m, unsigned n) {
assert(u && "Must provide dividend");
assert(v && "Must provide divisor");
@@ -1266,16 +1272,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// overflow. Note that this can require an extra word in u so that u must
// be of length m+n+1.
unsigned shift = countLeadingZeros(v[n-1]);
- unsigned v_carry = 0;
- unsigned u_carry = 0;
+ uint32_t v_carry = 0;
+ uint32_t u_carry = 0;
if (shift) {
for (unsigned i = 0; i < m+n; ++i) {
- unsigned u_tmp = u[i] >> (32 - shift);
+ uint32_t u_tmp = u[i] >> (32 - shift);
u[i] = (u[i] << shift) | u_carry;
u_carry = u_tmp;
}
for (unsigned i = 0; i < n; ++i) {
- unsigned v_tmp = v[i] >> (32 - shift);
+ uint32_t v_tmp = v[i] >> (32 - shift);
v[i] = (v[i] << shift) | v_carry;
v_carry = v_tmp;
}
@@ -1296,11 +1302,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
// Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
// Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
- // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+ // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
// on v[n-2] determines at high speed most of the cases in which the trial
// value qp is one too large, and it eliminates all cases where qp is two
// too large.
- uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ uint64_t dividend = Make_64(u[j+n], u[j+n-1]);
DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
uint64_t qp = dividend / v[n-1];
uint64_t rp = dividend % v[n-1];
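
The KnuthDiv cleanups in this file replace open-coded shifts and masks with the 32/64-bit splitting helpers from llvm/Support/MathExtras.h. For reference, a sketch equivalent to their documented behavior (not the canonical definitions):

    #include <cstdint>

    // Lower and upper 32-bit halves of a 64-bit value, and their inverse.
    constexpr uint32_t Lo_32(uint64_t V) { return static_cast<uint32_t>(V); }
    constexpr uint32_t Hi_32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
    constexpr uint64_t Make_64(uint32_t High, uint32_t Low) {
      return (uint64_t(High) << 32) | Low;
    }

So Make_64(u[j+n], u[j+n-1]) below computes the same dividend as the old (uint64_t(u[j+n]) << 32) + u[j+n-1], with the intent made explicit.
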
@@ -1323,14 +1329,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
int64_t borrow = 0;
for (unsigned i = 0; i < n; ++i) {
uint64_t p = uint64_t(qp) * uint64_t(v[i]);
- int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p;
- u[j+i] = (unsigned)subres;
- borrow = (p >> 32) - (subres >> 32);
+ int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
+ u[j+i] = Lo_32(subres);
+ borrow = Hi_32(p) - Hi_32(subres);
DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i]
<< ", borrow = " << borrow << '\n');
}
bool isNeg = u[j+n] < borrow;
- u[j+n] -= (unsigned)borrow;
+ u[j+n] -= Lo_32(borrow);
DEBUG(dbgs() << "KnuthDiv: after subtraction:");
DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
@@ -1338,7 +1344,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
// negative, go to step D6; otherwise go on to step D7.
- q[j] = (unsigned)qp;
+ q[j] = Lo_32(qp);
if (isNeg) {
// D6. [Add back]. The probability that this step is necessary is very
// small, on the order of only 2/b. Make sure that test data accounts for
@@ -1349,7 +1355,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// since it cancels with the borrow that occurred in D4.
bool carry = false;
for (unsigned i = 0; i < n; i++) {
- unsigned limit = std::min(u[j+i],v[i]);
+ uint32_t limit = std::min(u[j+i],v[i]);
u[j+i] += v[i] + carry;
carry = u[j+i] < limit || (carry && u[j+i] == limit);
}
@@ -1374,7 +1380,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// multiplication by d by using a shift left. So, all we have to do is
// shift right here.
if (shift) {
- unsigned carry = 0;
+ uint32_t carry = 0;
DEBUG(dbgs() << "KnuthDiv: remainder:");
for (int i = n-1; i >= 0; i--) {
r[i] = (u[i] >> shift) | carry;
@@ -1403,17 +1409,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// can't use 64-bit operands here because we don't have native results of
// 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
// work on big-endian machines.
- uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
unsigned n = rhsWords * 2;
unsigned m = (lhsWords * 2) - n;
// Allocate space for the temporary values we need either on the stack, if
// it will fit, or on the heap if it won't.
- unsigned SPACE[128];
- unsigned *U = nullptr;
- unsigned *V = nullptr;
- unsigned *Q = nullptr;
- unsigned *R = nullptr;
+ uint32_t SPACE[128];
+ uint32_t *U = nullptr;
+ uint32_t *V = nullptr;
+ uint32_t *Q = nullptr;
+ uint32_t *R = nullptr;
if ((Remainder?4:3)*n+2*m+1 <= 128) {
U = &SPACE[0];
V = &SPACE[m+n+1];
@@ -1421,34 +1426,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
if (Remainder)
R = &SPACE[(m+n+1) + n + (m+n)];
} else {
- U = new unsigned[m + n + 1];
- V = new unsigned[n];
- Q = new unsigned[m+n];
+ U = new uint32_t[m + n + 1];
+ V = new uint32_t[n];
+ Q = new uint32_t[m+n];
if (Remainder)
- R = new unsigned[n];
+ R = new uint32_t[n];
}
// Initialize the dividend
- memset(U, 0, (m+n+1)*sizeof(unsigned));
+ memset(U, 0, (m+n+1)*sizeof(uint32_t));
for (unsigned i = 0; i < lhsWords; ++i) {
- uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.U.VAL : LHS.U.pVal[i]);
- U[i * 2] = (unsigned)(tmp & mask);
- U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = LHS.getRawData()[i];
+ U[i * 2] = Lo_32(tmp);
+ U[i * 2 + 1] = Hi_32(tmp);
}
U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
// Initialize the divisor
- memset(V, 0, (n)*sizeof(unsigned));
+ memset(V, 0, (n)*sizeof(uint32_t));
for (unsigned i = 0; i < rhsWords; ++i) {
- uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.U.VAL : RHS.U.pVal[i]);
- V[i * 2] = (unsigned)(tmp & mask);
- V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = RHS.getRawData()[i];
+ V[i * 2] = Lo_32(tmp);
+ V[i * 2 + 1] = Hi_32(tmp);
}
// initialize the quotient and remainder
- memset(Q, 0, (m+n) * sizeof(unsigned));
+ memset(Q, 0, (m+n) * sizeof(uint32_t));
if (Remainder)
- memset(R, 0, n * sizeof(unsigned));
+ memset(R, 0, n * sizeof(uint32_t));
// Now, adjust m and n for the Knuth division. n is the number of words in
// the divisor. m is the number of words by which the dividend exceeds the
@@ -1469,22 +1474,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// are using base 2^32 instead of base 10.
assert(n != 0 && "Divide by zero?");
if (n == 1) {
- unsigned divisor = V[0];
- unsigned remainder = 0;
- for (int i = m+n-1; i >= 0; i--) {
- uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+ uint32_t divisor = V[0];
+ uint32_t remainder = 0;
+ for (int i = m; i >= 0; i--) {
+ uint64_t partial_dividend = Make_64(remainder, U[i]);
if (partial_dividend == 0) {
Q[i] = 0;
remainder = 0;
} else if (partial_dividend < divisor) {
Q[i] = 0;
- remainder = (unsigned)partial_dividend;
+ remainder = Lo_32(partial_dividend);
} else if (partial_dividend == divisor) {
Q[i] = 1;
remainder = 0;
} else {
- Q[i] = (unsigned)(partial_dividend / divisor);
- remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+ Q[i] = Lo_32(partial_dividend / divisor);
+ remainder = Lo_32(partial_dividend - (Q[i] * divisor));
}
}
if (R)
@@ -1498,24 +1503,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// If the caller wants the quotient
if (Quotient) {
// Set up the Quotient value's memory.
- if (Quotient->BitWidth != LHS.BitWidth) {
- if (Quotient->isSingleWord())
- Quotient->U.VAL = 0;
- else
- delete [] Quotient->U.pVal;
- Quotient->BitWidth = LHS.BitWidth;
- if (!Quotient->isSingleWord())
- Quotient->U.pVal = getClearedMemory(Quotient->getNumWords());
- } else
- Quotient->clearAllBits();
+ Quotient->reallocate(LHS.BitWidth);
+ // Clear out any previous bits.
+ Quotient->clearAllBits();
// The quotient is in Q. Reconstitute the quotient into Quotient's low
// order words.
// This case is currently dead as all users of divide() handle trivial cases
// earlier.
if (lhsWords == 1) {
- uint64_t tmp =
- uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(Q[1], Q[0]);
if (Quotient->isSingleWord())
Quotient->U.VAL = tmp;
else
@@ -1523,30 +1520,21 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
for (unsigned i = 0; i < lhsWords; ++i)
- Quotient->U.pVal[i] =
- uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]);
}
}
// If the caller wants the remainder
if (Remainder) {
// Set up the Remainder value's memory.
- if (Remainder->BitWidth != RHS.BitWidth) {
- if (Remainder->isSingleWord())
- Remainder->U.VAL = 0;
- else
- delete [] Remainder->U.pVal;
- Remainder->BitWidth = RHS.BitWidth;
- if (!Remainder->isSingleWord())
- Remainder->U.pVal = getClearedMemory(Remainder->getNumWords());
- } else
- Remainder->clearAllBits();
+ Remainder->reallocate(RHS.BitWidth);
+ // Clear out any previous bits.
+ Remainder->clearAllBits();
// The remainder is in R. Reconstitute the remainder into Remainder's low
// order words.
if (rhsWords == 1) {
- uint64_t tmp =
- uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(R[1], R[0]);
if (Remainder->isSingleWord())
Remainder->U.VAL = tmp;
else
@@ -1554,8 +1542,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
for (unsigned i = 0; i < rhsWords; ++i)
- Remainder->U.pVal[i] =
- uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]);
}
}
@@ -1578,29 +1565,30 @@ APInt APInt::udiv(const APInt& RHS) const {
}
// Get some facts about the LHS and RHS number of bits and words
- unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Divided by zero???");
- unsigned lhsBits = this->getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
// Deal with some degenerate cases
if (!lhsWords)
// 0 / X ===> 0
return APInt(BitWidth, 0);
- else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X / 1 ===> X
+ return *this;
+ if (lhsWords < rhsWords || this->ult(RHS))
// X / Y ===> 0, iff X < Y
return APInt(BitWidth, 0);
- } else if (*this == RHS) {
+ if (*this == RHS)
// X / X ===> 1
return APInt(BitWidth, 1);
- } else if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
// All high words are zero, just use native divide
return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Quotient(1,0); // to hold result.
+ APInt Quotient; // to hold result.
divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr);
return Quotient;
}
@@ -1624,31 +1612,32 @@ APInt APInt::urem(const APInt& RHS) const {
}
// Get some facts about the LHS
- unsigned lhsBits = getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
// Get some facts about the RHS
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Performing remainder operation by zero ???");
// Check the degenerate cases
- if (lhsWords == 0) {
+ if (lhsWords == 0)
// 0 % Y ===> 0
return APInt(BitWidth, 0);
- } else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X % 1 ===> 0
+ return APInt(BitWidth, 0);
+ if (lhsWords < rhsWords || this->ult(RHS))
// X % Y ===> X, iff X < Y
return *this;
- } else if (*this == RHS) {
+ if (*this == RHS)
// X % X == 0;
return APInt(BitWidth, 0);
- } else if (lhsWords == 1) {
+ if (lhsWords == 1)
// All high words are zero, just use native remainder
return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Remainder(1,0);
+ APInt Remainder;
divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
return Remainder;
}
@@ -1667,22 +1656,23 @@ APInt APInt::srem(const APInt &RHS) const {
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ unsigned BitWidth = LHS.BitWidth;
// First, deal with the easy case
if (LHS.isSingleWord()) {
assert(RHS.U.VAL != 0 && "Divide by zero?");
uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
- Quotient = APInt(LHS.BitWidth, QuotVal);
- Remainder = APInt(LHS.BitWidth, RemVal);
+ Quotient = APInt(BitWidth, QuotVal);
+ Remainder = APInt(BitWidth, RemVal);
return;
}
// Get some size facts about the dividend and divisor
- unsigned lhsBits = LHS.getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
+ assert(rhsWords && "Performing divrem operation by zero ???");
// Check the degenerate cases
if (lhsWords == 0) {
@@ -1691,6 +1681,11 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
+  if (rhsBits == 1) {
+    Quotient = LHS; // X / 1 ===> X
+    Remainder = 0;  // X % 1 ===> 0
+    return;
+  }
+
if (lhsWords < rhsWords || LHS.ult(RHS)) {
Remainder = LHS; // X % Y ===> X, iff X < Y
Quotient = 0; // X / Y ===> 0, iff X < Y
@@ -1703,12 +1698,15 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
- if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
// There is only one word to consider so use the native versions.
- uint64_t lhsValue = LHS.isSingleWord() ? LHS.U.VAL : LHS.U.pVal[0];
- uint64_t rhsValue = RHS.isSingleWord() ? RHS.U.VAL : RHS.U.pVal[0];
- Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
- Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+ uint64_t lhsValue = LHS.U.pVal[0];
+ uint64_t rhsValue = RHS.U.pVal[0];
+ // Make sure there is enough space to hold the results.
+ Quotient.reallocate(BitWidth);
+ Remainder.reallocate(BitWidth);
+ Quotient = lhsValue / rhsValue;
+ Remainder = lhsValue % rhsValue;
return;
}
@@ -1723,12 +1721,12 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
else {
APInt::udivrem(-LHS, RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
}
- Remainder = -Remainder;
+ Remainder.negate();
} else if (RHS.isNegative()) {
APInt::udivrem(LHS, -RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
} else {
APInt::udivrem(LHS, RHS, Quotient, Remainder);
}
@@ -1859,10 +1857,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
*this += digit;
}
// If it's negative, put it in two's complement form
- if (isNeg) {
- --(*this);
- this->flipAllBits();
- }
+ if (isNeg)
+ this->negate();
}
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
@@ -1940,8 +1936,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// They want to print the signed version and it is a negative value
// Flip the bits and add one to turn it into the equivalent positive
// value and put a '-' in the result.
- Tmp.flipAllBits();
- ++Tmp;
+ Tmp.negate();
Str.push_back('-');
}
@@ -1961,22 +1956,19 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
unsigned MaskAmt = Radix - 1;
- while (Tmp != 0) {
+ while (Tmp.getBoolValue()) {
unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
Str.push_back(Digits[Digit]);
Tmp.lshrInPlace(ShiftAmt);
}
} else {
- APInt divisor(Radix == 10? 4 : 8, Radix);
- while (Tmp != 0) {
- APInt APdigit(1, 0);
- APInt tmp2(Tmp.getBitWidth(), 0);
- divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
- &APdigit);
+ APInt divisor(Tmp.getBitWidth(), Radix);
+ APInt APdigit;
+ while (Tmp.getBoolValue()) {
+ udivrem(Tmp, divisor, Tmp, APdigit);
unsigned Digit = (unsigned)APdigit.getZExtValue();
assert(Digit < Radix && "divide failed");
Str.push_back(Digits[Digit]);
- Tmp = tmp2;
}
}
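The rewritten loop is the classic repeated-divmod radix conversion, now reusing Tmp as both dividend and quotient since udivrem (re)allocates its outputs. On plain integers the algorithm looks like this:

    #include <cstdint>
    #include <string>

    // Emit the digits of V in the given radix (2..36). Mirrors the udivrem
    // loop above: divide, record the remainder as a digit, repeat until zero.
    std::string toRadix(uint64_t V, unsigned Radix) {
      static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
      if (V == 0)
        return "0";
      std::string S;
      while (V != 0) {
        S.insert(S.begin(), Digits[V % Radix]); // remainder is the next digit
        V /= Radix;
      }
      return S;
    }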
@@ -2346,13 +2338,11 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs,
return overflow;
}
-/* DST = LHS * RHS, where DST has width the sum of the widths of the
- operands. No overflow occurs. DST must be disjoint from both
- operands. Returns the number of parts required to hold the
- result. */
-unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
- const WordType *rhs, unsigned lhsParts,
- unsigned rhsParts) {
+/// DST = LHS * RHS, where DST has width the sum of the widths of the
+/// operands. No overflow occurs. DST must be disjoint from both operands.
+void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned lhsParts,
+ unsigned rhsParts) {
/* Put the narrower number on the LHS for fewer loops below. */
if (lhsParts > rhsParts)
return tcFullMultiply(dst, rhs, lhs, rhsParts, lhsParts);
@@ -2363,10 +2353,6 @@ unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
for (unsigned i = 0; i < lhsParts; i++)
tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
-
- unsigned n = lhsParts + rhsParts;
-
- return n - (dst[n - 1] == 0);
}
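tcFullMultiply can drop its return value because the result width is fixed: an m-part number times an n-part number always fits in m + n parts, so the caller knows the width up front. A one-part illustration (assuming a compiler with __uint128_t):

    #include <cstdint>

    // 1 + 1 = 2 words always suffice for a full 64x64 multiply; no "parts
    // actually used" count is needed.
    void mul64x64(uint64_t A, uint64_t B, uint64_t Dst[2]) {
      __uint128_t P = static_cast<__uint128_t>(A) * B;
      Dst[0] = static_cast<uint64_t>(P);       // low word
      Dst[1] = static_cast<uint64_t>(P >> 64); // high word
    }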
/* If RHS is zero, LHS and REMAINDER are left unchanged; return one.
@@ -2400,22 +2386,20 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs,
/* Loop, subtracting SRHS if REMAINDER is greater and adding that to
the total. */
for (;;) {
- int compare;
-
- compare = tcCompare(remainder, srhs, parts);
- if (compare >= 0) {
- tcSubtract(remainder, srhs, 0, parts);
- lhs[n] |= mask;
- }
+ int compare = tcCompare(remainder, srhs, parts);
+ if (compare >= 0) {
+ tcSubtract(remainder, srhs, 0, parts);
+ lhs[n] |= mask;
+ }
- if (shiftCount == 0)
- break;
- shiftCount--;
- tcShiftRight(srhs, parts, 1);
- if ((mask >>= 1) == 0) {
- mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
- n--;
- }
+ if (shiftCount == 0)
+ break;
+ shiftCount--;
+ tcShiftRight(srhs, parts, 1);
+ if ((mask >>= 1) == 0) {
+ mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
+ n--;
+ }
}
return false;
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 63c440037c22..83376284548f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_library(LLVMSupport
MD5.cpp
NativeFormatting.cpp
Options.cpp
+ Parallel.cpp
PluginLoader.cpp
PrettyStackTrace.cpp
RandomNumberGenerator.cpp
diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp
new file mode 100644
index 000000000000..ab2cfdebf07d
--- /dev/null
+++ b/lib/Support/Parallel.cpp
@@ -0,0 +1,138 @@
+//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Parallel.h"
+#include "llvm/Config/llvm-config.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <stack>
+#include <thread>
+
+using namespace llvm;
+
+namespace {
+
+/// \brief An abstract class that takes closures and runs them asynchronously.
+class Executor {
+public:
+ virtual ~Executor() = default;
+ virtual void add(std::function<void()> func) = 0;
+
+ static Executor *getDefaultExecutor();
+};
+
+#if !LLVM_ENABLE_THREADS
+class SyncExecutor : public Executor {
+public:
+  void add(std::function<void()> F) override { F(); }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static SyncExecutor Exec;
+ return &Exec;
+}
+
+#elif defined(_MSC_VER)
+/// \brief An Executor that runs tasks via ConcRT.
+class ConcRTExecutor : public Executor {
+ struct Taskish {
+ Taskish(std::function<void()> Task) : Task(Task) {}
+
+ std::function<void()> Task;
+
+ static void run(void *P) {
+ Taskish *Self = static_cast<Taskish *>(P);
+ Self->Task();
+ concurrency::Free(Self);
+ }
+ };
+
+public:
+  void add(std::function<void()> F) override {
+ Concurrency::CurrentScheduler::ScheduleTask(
+ Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F));
+ }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ConcRTExecutor exec;
+ return &exec;
+}
+
+#else
+/// \brief An implementation of an Executor that runs closures on a thread pool
+/// in FILO order.
+class ThreadPoolExecutor : public Executor {
+public:
+ explicit ThreadPoolExecutor(
+ unsigned ThreadCount = std::thread::hardware_concurrency())
+ : Done(ThreadCount) {
+    // Spawn the workers from a separate thread, since spawning threads can
+    // take a while; that spawning thread then becomes the final worker.
+ std::thread([&, ThreadCount] {
+ for (size_t i = 1; i < ThreadCount; ++i) {
+ std::thread([=] { work(); }).detach();
+ }
+ work();
+ }).detach();
+ }
+
+ ~ThreadPoolExecutor() override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Stop = true;
+ Lock.unlock();
+ Cond.notify_all();
+ // Wait for ~Latch.
+ }
+
+ void add(std::function<void()> F) override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ WorkStack.push(F);
+ Lock.unlock();
+ Cond.notify_one();
+ }
+
+private:
+ void work() {
+ while (true) {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop)
+ break;
+ auto Task = WorkStack.top();
+ WorkStack.pop();
+ Lock.unlock();
+ Task();
+ }
+ Done.dec();
+ }
+
+ std::atomic<bool> Stop{false};
+ std::stack<std::function<void()>> WorkStack;
+ std::mutex Mutex;
+ std::condition_variable Cond;
+ parallel::detail::Latch Done;
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ThreadPoolExecutor exec;
+ return &exec;
+}
+#endif
+} // end anonymous namespace
+
+#if LLVM_ENABLE_THREADS
+void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
+ L.inc();
+ Executor::getDefaultExecutor()->add([&, F] {
+ F();
+ L.dec();
+ });
+}
+#endif
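A sketch of how this executor is reached through the public API, assuming the TaskGroup interface declared in llvm/Support/Parallel.h (spawn plus a destructor that waits on the Latch shown above); doWork is a hypothetical per-item function:

    #include "llvm/Support/Parallel.h"

    void doWork(int Item); // hypothetical per-item work

    // Run N independent closures on the default executor and wait for all
    // of them. Assumes ~TaskGroup blocks until every spawned task has run.
    void runAll(int N) {
      llvm::parallel::detail::TaskGroup TG;
      for (int i = 0; i < N; ++i)
        TG.spawn([i] { doWork(i); });
    } // TG's destructor joins the outstanding tasks here.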
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index fa28ba1b6ab6..cdea09be41e0 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -103,13 +103,16 @@
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
#endif
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include <sys/sysctl.h>
+#endif
+
using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
-#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \
defined(_AIX)
static int
@@ -164,7 +167,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
free(pv);
return nullptr;
}
-#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
+#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX
/// GetMainExecutable - Return the path to the main executable, given the
/// value of argv[0] from program startup.
@@ -180,9 +183,24 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (realpath(exe_path, link_path))
return link_path;
}
-#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
- defined(__FreeBSD_kernel__) || defined(_AIX)
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__)
+ int mib[4];
+ mib[0] = CTL_KERN;
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ mib[1] = KERN_PROC;
+ mib[2] = KERN_PROC_PATHNAME;
+ mib[3] = -1;
+#else
+ mib[1] = KERN_PROC_ARGS;
+ mib[2] = -1;
+ mib[3] = KERN_PROC_PATHNAME;
+#endif
+ char exe_path[PATH_MAX];
+ size_t cb = sizeof(exe_path);
+ if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0)
+ return exe_path;
+#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
+ defined(__DragonFly__) || defined(_AIX)
char exe_path[PATH_MAX];
if (getprogpath(exe_path, argv0) != NULL)
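On FreeBSD and NetBSD the kernel can report the executable path directly, which is what the new sysctl block does. A self-contained FreeBSD-flavored sketch of the same call:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <limits.h>
    #include <string>

    // Ask the kernel for the current process's executable path; -1 selects
    // the calling process. Returns an empty string on failure.
    std::string currentExePath() {
      int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
      char Path[PATH_MAX];
      size_t Len = sizeof(Path);
      if (sysctl(Mib, 4, Path, &Len, nullptr, 0) == 0)
        return Path;
      return std::string();
    }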
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 16f8f5a98e52..1d0143c6716e 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -347,7 +347,7 @@ static bool terminalHasColors(int fd) {
MutexGuard G(*TermColorMutex);
int errret = 0;
- if (setupterm((char *)nullptr, fd, &errret) != 0)
+ if (setupterm(nullptr, fd, &errret) != 0)
// Regardless of why, if we can't get terminfo, we shouldn't try to print
// colors.
return false;
@@ -369,7 +369,7 @@ static bool terminalHasColors(int fd) {
// Now extract the structure allocated by setupterm and free its memory
// through a really silly dance.
- struct term *termp = set_curterm((struct term *)nullptr);
+ struct term *termp = set_curterm(nullptr);
(void)del_curterm(termp); // Drop any errors here.
// Return true if we found color capabilities for the current terminal.
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 73f2b6a25f66..4af5fef4287c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index ff3e4c40e2c2..29f6d571d6bd 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
- CallSeqStart.addImm(Handler.StackSize);
+ CallSeqStart.addImm(Handler.StackSize).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
.addImm(Handler.StackSize)
.addImm(0);
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 083708001757..9ac7ecb9cdb4 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Process the args.
for (CCValAssign &VA : ArgLocs) {
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4b1bb27dce73..4f7c2e122390 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
@@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
- true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index cb268828455e..c42738da7ab0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
@@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
@@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
@@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
+
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1: {
+ // FNMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMADD R,A,B,C // = -A*B - C
+ // --- Create(FNMADD);
+ if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
@@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
+ }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Accumulator);
}
break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
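The new pattern folds a negated multiply feeding a subtract into a single fused FNMADD. The value identity, written out in scalar C++ for reference (the fused form rounds once, so results can differ in the last bit):

    #include <cmath>

    // FNMUL I = -(A*B); FSUB R = I - C; so R = -(A*B) - C, which is what
    // AArch64's FNMADD computes, modulo the single rounding of the fusion.
    double unfused(double A, double B, double C) { return -(A * B) - C; }
    double fused(double A, double B, double C) { return std::fma(-A, B, -C); }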
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 902b08844216..5ddf66654a67 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -328,8 +329,9 @@ include "AArch64InstrFormats.td"
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to the empty list because we expect these instructions to
// simply get removed in most cases.
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 5f895903da6f..789270c2a34b 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= PMI_FirstFPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
Cost = 2;
+ else
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+ // instead of blind map every scalar to GPR.
+ for (const MachineInstr &UseMI :
+ MRI.use_instructions(MI.getOperand(0).getReg()))
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR.
+ // If it was not, we would have had a bitcast before
+ // reaching that instruction.
+ if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
break;
+ case TargetOpcode::G_STORE:
+ // Check if that store is fed by fp instructions.
+ if (OpRegBankIdx[0] == PMI_FirstGPR) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (!VReg)
+ break;
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode()))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
}
// Finally construct the computed mapping.
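At the C level, the case this heuristic improves is a scalar FP value that only passes through memory and FP arithmetic; keeping the load's result on FPR avoids a cross-bank copy:

    // Without the use-scan, the G_LOAD result defaults to GPR and the fadd
    // forces a GPR-to-FPR copy; with it, the load maps straight to FPR.
    float addLoaded(const float *P, float X) {
      return *P + X; // the load's only user is an FP instruction
    }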
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 8f8eeef8a6cf..a9b4d44a523e 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
@@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
@@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1
def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>;
// SIMD Integer Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
@@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
@@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
@@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+
// SIMD Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
@@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
@@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
@@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
// FP Miscellaneous Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>;
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>;
def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>;
@@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
// Divide and Multiply Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>;
def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
// Move and Shift Instructions
// -----------------------------------------------------------------------------
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
index e64b2c441a19..6526cc28e806 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -29,8 +29,9 @@
// Define 1 micro-op types
def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
-def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
-def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
@@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
-def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
@@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 4;
let NumMicroOps = 2;
}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 5;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 6;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
let Latency = 4;
@@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
let NumMicroOps = 9;
}
-// Forwarding logic is modeled for vector multiply and accumulate
+// Forwarding logic is modeled for multiply add/accumulate.
// -----------------------------------------------------------------------------
-def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc,
- FalkorWr_2VXVY_4cyc]>;
-def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc,
- FalkorWr_1VXVY_6cyc,
- FalkorWr_2VXVY_5cyc,
- FalkorWr_2VXVY_6cyc]>;
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
// -----------------------------------------------------------------------------
-def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>;
def FalkorWr_FMOV : SchedWriteVariant<[
@@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[
def FalkorWr_ADD : SchedWriteVariant<[
SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
- SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>,
SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
def FalkorWr_PRFM : SchedWriteVariant<[
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index abdeac019a18..1c81d34014fd 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() {
case Falkor:
MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case Kryo:
MaxInterleaveFactor = 4;
@@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 740;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX2T99:
CacheLineSize = 64;
@@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX:
case ThunderXT88:
@@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() {
CacheLineSize = 128;
PrefFunctionAlignment = 3;
PrefLoopAlignment = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case CortexA35: break;
case CortexA53: break;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 5b9bee6e41b8..df54bf3f48e1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -83,6 +83,9 @@ protected:
// NegativeImmediates - transform instructions with negative immediates
bool NegativeImmediates = true;
+ // Enable 64-bit vectorization in SLP.
+ unsigned MinVectorRegisterBitWidth = 64;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -106,6 +109,7 @@ protected:
unsigned PrefFunctionAlignment = 0;
unsigned PrefLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
+ unsigned WideningBaseCost = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -190,6 +194,10 @@ public:
bool isXRaySupported() const override { return true; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ return MinVectorRegisterBitWidth;
+ }
+
bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
@@ -228,6 +236,8 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+ unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8875f9b72647..12a2e9a867f0 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
+
+void AArch64_MachoTargetObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+  // AArch64 does not use section-relative relocations, so any global symbol
+ // be accessed via at least a linker-private symbol.
+ getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true);
+}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 05e1dfa9e6c9..47e3bce43f6e 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -40,6 +40,9 @@ public:
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
+
+ void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4d59da0c646d..7c6f55c06bce 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+ // A helper that returns a vector type from the given type. The number of
+ // elements in type Ty determine the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+ // 16-bits wide.
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+ // To be a widening instruction (either the "wide" or "long" versions), the
+ // second operand must be a sign- or zero extend having a single user. We
+ // only consider extends having a single user because they may otherwise not
+ // be eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element type size is twice that of the source type.
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
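Concretely, the cast is considered free when its only user is an add or sub that AArch64 can select as a long/wide operation (uaddl, saddw, and friends). A C-level loop that lowers this way once vectorized (a sketch; the exact selection depends on the vectorizer):

    #include <cstdint>

    // Each element is zero-extended from 16 to 32 bits and then added; the
    // extend folds into a UADDW/UADDL-style widening add, so the cost model
    // above charges the zext nothing.
    void widenedAdd(uint32_t *Dst, const uint16_t *Src, int N) {
      for (int i = 0; i < N; ++i)
        Dst[i] += Src[i]; // zext i16 -> i32 feeding the add
    }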
@@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -414,7 +508,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e37c003e064c..280d97f3c502 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
+ bool isWideningInstruction(Type *Ty, unsigned Opcode,
+ ArrayRef<const Value *> Args);
+
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -84,6 +87,10 @@ public:
return 64;
}
+ unsigned getMinVectorRegisterBitWidth() {
+ return ST->getMinVectorRegisterBitWidth();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -134,6 +141,10 @@ public:
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 4dbcc9581a84..449d732a8d44 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+ return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
/// parseDirectiveCPU
/// ::= .cpu id
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
- SMLoc CPULoc = getLoc();
+ SMLoc CurLoc = getLoc();
StringRef CPU, ExtensionString;
std::tie(CPU, ExtensionString) =
@@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME This is using tablegen data, but should be moved to ARMTargetParser
// once that is tablegen'ed
if (!getSTI().isCPUStringValid(CPU)) {
- Error(CPULoc, "unknown CPU name");
+ Error(CurLoc, "unknown CPU name");
return false;
}
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
+ CurLoc = incrementLoc(CurLoc, CPU.size());
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
+ // Advance source location past '+'.
+ CurLoc = incrementLoc(CurLoc, 1);
+
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
@@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
Name = Name.substr(2);
}
+ bool FoundExtension = false;
for (const auto &Extension : ExtensionMap) {
if (Extension.Name != Name)
continue;
@@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
uint64_t Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
+ FoundExtension = true;
break;
}
+
+ if (!FoundExtension)
+ Error(CurLoc, "unsupported architectural extension");
+
+ CurLoc = incrementLoc(CurLoc, Name.size());
}
return false;
}
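With the location advanced past the CPU name and each '+', a bad extension in, say, `.cpu generic+bogus` is now reported at `bogus` rather than at the start of the operand. The pointer walk, as a standalone sketch:

    #include <cstring>

    // Walk "cpu+ext1+ext2...": skip the CPU name, then visit each extension,
    // tracking its offset the way incrementLoc advances the SMLoc.
    void visitExtensions(const char *Operand) {
      const char *Cur = Operand + strcspn(Operand, "+"); // past the CPU name
      while (*Cur == '+') {
        ++Cur;                          // past '+'
        size_t Len = strcspn(Cur, "+"); // extension name length
        // A diagnostic for this extension points at offset Cur - Operand.
        Cur += Len;
      }
    }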
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 94112849f84e..1b28df963b40 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -32,8 +32,9 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the short, Apple-specific
+ // form when targeting Darwin.
+ AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = "L";
@@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the generic form when
+ // targeting ELF.
+ AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
CodePointerSize = 8;
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 8f6e1e7d8846..3f89702bed50 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+
+void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
+extern char &AMDGPUMachineCFGStructurizerID;
ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 2e5b78bbf7ef..b279bd61e180 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"Support flat address space"
>;
+def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
+ "FlatInstOffsets",
+ "true",
+ "Flat instructions have immediate offset addressing mode"
+>;
+
+def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
+ "FlatGlobalInsts",
+ "true",
+ "Have global_* flat memory instructions"
+>;
+
+def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
+ "FlatScratchInsts",
+ "true",
+ "Have scratch_* flat memory instructions"
+>;
+
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
@@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureDPP
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ccae36ced1f8..7c99752b881f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -136,8 +136,7 @@ private:
bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
SDValue &ImmOffset, SDValue &VOffset) const;
- bool SelectFlat(SDValue Addr, SDValue &VAddr,
- SDValue &SLC, SDValue &TFE) const;
+ bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
SDValue &VAddr,
- SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &SLC) const {
VAddr = Addr;
- TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 915d1d9e0e68..f80652b87373 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) {
case AMDGPUISD::INTERP_P1:
case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+
+ // TODO: Should really be looking at the users of the bitcast. These are
+ // problematic because bitcasts are used to legalize all stores to integer
+ // types.
+ case ISD::BITCAST:
return false;
default:
return true;
}
}
-static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold) {
// Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
// it is truly free to use a source modifier in all cases. If there are
// multiple users, but each one will necessitate using VOP3, there will be
@@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
DAG.computeKnownBits(Op, Known);
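+ // countMinLeadingZeros() returns Known.Zero.countLeadingOnes(), i.e. the
+ // number of high bits known to be zero, so this checks that the value
+ // provably fits in 24 bits.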
- return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24;
+ return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e1a5a2072418..4c588a7bafd0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -132,6 +132,8 @@ public:
return false;
}
+ static bool allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8867ed689a31..a7eac080f885 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0)
- .addImm(0)
.addImm(0);
+
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
@@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
.add(I.getOperand(0))
.addReg(PtrReg)
.addImm(0)
- .addImm(0)
.addImm(0);
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a2567a549028..9de302994e68 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
+ setAction({G_CONSTANT, S32}, Legal);
setAction({G_CONSTANT, S64}, Legal);
setAction({G_GEP, P1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
new file mode 100644
index 000000000000..6d2785ba1c60
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -0,0 +1,2881 @@
+//===- AMDGPUMachineCFGStructurizer.cpp - Machine CFG structurizer pass --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level CFG structurizer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <tuple>
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpucfgstructurizer"
+
+namespace {
+class PHILinearizeDestIterator;
+
+class PHILinearize {
+ friend class PHILinearizeDestIterator;
+
+public:
+ typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT;
+
+private:
+ typedef DenseSet<PHISourceT> PHISourcesT;
+ typedef struct {
+ unsigned DestReg;
+ DebugLoc DL;
+ PHISourcesT Sources;
+ } PHIInfoElementT;
+ typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT;
+ PHIInfoT PHIInfo;
+
+ static unsigned phiInfoElementGetDest(PHIInfoElementT *Info);
+ static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef);
+ static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info);
+ static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ static void phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ PHIInfoElementT *findPHIInfoElement(unsigned DestReg);
+ PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+
+public:
+ bool findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources);
+ void addDest(unsigned DestReg, const DebugLoc &DL);
+ void replaceDef(unsigned OldDestReg, unsigned NewDestReg);
+ void deleteDef(unsigned DestReg);
+ void addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ void removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB = nullptr);
+ bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg);
+ bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr);
+ unsigned getNumSources(unsigned DestReg);
+ void dump(MachineRegisterInfo *MRI);
+ void clear();
+
+ typedef PHISourcesT::iterator source_iterator;
+ typedef PHILinearizeDestIterator dest_iterator;
+
+ dest_iterator dests_begin();
+ dest_iterator dests_end();
+
+ source_iterator sources_begin(unsigned Reg);
+ source_iterator sources_end(unsigned Reg);
+};
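+
+// PHILinearize is a side table mapping each PHI destination register to
+// the set of (source register, predecessor block) pairs feeding it. The
+// structurizer deletes PHIs while linearizing the CFG and uses this
+// information to rebuild them once the final block layout is known.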
+
+class PHILinearizeDestIterator {
+private:
+ PHILinearize::PHIInfoT::iterator Iter;
+
+public:
+ unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); }
+ PHILinearizeDestIterator &operator++() {
+ ++Iter;
+ return *this;
+ }
+ bool operator==(const PHILinearizeDestIterator &I) const {
+ return I.Iter == Iter;
+ }
+ bool operator!=(const PHILinearizeDestIterator &I) const {
+ return I.Iter != Iter;
+ }
+
+ PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
+};
+
+unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) {
+ return Info->DestReg;
+}
+
+void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info,
+ unsigned NewDef) {
+ Info->DestReg = NewDef;
+}
+
+PHILinearize::PHISourcesT &
+PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) {
+ return Info->Sources;
+}
+
+void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ // The assertion ensures that we do not add two different source
+ // registers for the same SourceMBB: a PHI cannot take different
+ // registers from identical predecessors, but it can take the same
+ // register from multiple predecessors.
+#if !defined(NDEBUG)
+ for (auto SI : phiInfoElementGetSources(Info)) {
+ assert((SI.second != SourceMBB || SourceReg == SI.first));
+ }
+#endif
+
+ phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ auto &Sources = phiInfoElementGetSources(Info);
+ SmallVector<PHISourceT, 4> EliminatedSources;
+ for (auto SI : Sources) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+ }
+ }
+
+ for (auto &Source : EliminatedSources) {
+ Sources.erase(Source);
+ }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+ for (auto I : PHIInfo) {
+ if (phiInfoElementGetDest(I) == DestReg) {
+ return I;
+ }
+ }
+ return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ return I;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources) {
+ bool FoundSource = false;
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.second == SourceMBB) {
+ FoundSource = true;
+ Sources.push_back(SI.first);
+ }
+ }
+ }
+ return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+ assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+ PHISourcesT EmptySet;
+ PHIInfoElementT *NewElement = new PHIInfoElementT();
+ NewElement->DestReg = DestReg;
+ NewElement->DL = DL;
+ NewElement->Sources = EmptySet;
+ PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+ phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+ PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+ PHIInfo.erase(InfoElement);
+ delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg) {
+ PHIInfoElementT *InfoElement =
+ findPHIInfoElementFromSource(SourceReg, SourceMBB);
+ if (InfoElement != nullptr) {
+ DestReg = phiInfoElementGetDest(InfoElement);
+ return true;
+ }
+ return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+ unsigned DestReg;
+ return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+ return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ dbgs() << "=PHIInfo Start=\n";
+ for (auto PII : this->PHIInfo) {
+ PHIInfoElementT &Element = *PII;
+ dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+ << " Sources: {";
+ for (auto &SI : Element.Sources) {
+ dbgs() << PrintReg(SI.first, TRI) << "(BB#"
+ << SI.second->getNumber() << "),";
+ }
+ dbgs() << "}\n";
+ }
+ dbgs() << "=PHIInfo End=\n";
+}
+
+void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
+
+PHILinearize::dest_iterator PHILinearize::dests_begin() {
+ return PHILinearizeDestIterator(PHIInfo.begin());
+}
+
+PHILinearize::dest_iterator PHILinearize::dests_end() {
+ return PHILinearizeDestIterator(PHIInfo.end());
+}
+
+PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).begin();
+}
+PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).end();
+}
+
+class RegionMRT;
+class MBBMRT;
+
+static unsigned getPHINumInputs(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return (PHI.getNumOperands() - 1) / 2;
+}
+
+static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 2).getMBB();
+}
+
+static void setPhiPred(MachineInstr &PHI, unsigned Index,
+ MachineBasicBlock *NewPred) {
+ PHI.getOperand(Index * 2 + 2).setMBB(NewPred);
+}
+
+static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 1).getReg();
+}
+
+static unsigned getPHIDestReg(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(0).getReg();
+}
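+
+// The helpers above assume the machine-level PHI operand layout
+//   %dst = PHI %src0, <bb#0>, %src1, <bb#1>, ...
+// where operand 0 is the destination, operand 2*i+1 is the i-th source
+// register, and operand 2*i+2 is the i-th predecessor block; hence the
+// Index * 2 + 1 / Index * 2 + 2 indexing.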
+
+class LinearizedRegion {
+protected:
+ MachineBasicBlock *Entry;
+ // The exit block is part of the region, and is the last
+ // merge block before exiting the region.
+ MachineBasicBlock *Exit;
+ DenseSet<unsigned> LiveOuts;
+ SmallPtrSet<MachineBasicBlock *, 1> MBBs;
+ bool HasLoop;
+ LinearizedRegion *Parent;
+ RegionMRT *RMRT;
+
+ void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo);
+
+ void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion);
+
+ void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion = nullptr);
+
+public:
+ void setRegionMRT(RegionMRT *Region) { RMRT = Region; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ void setParent(LinearizedRegion *P) { Parent = P; }
+
+ LinearizedRegion *getParent() { return Parent; }
+
+ void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr);
+
+ void setBBSelectRegIn(unsigned Reg);
+
+ unsigned getBBSelectRegIn();
+
+ void setBBSelectRegOut(unsigned Reg, bool IsLiveOut);
+
+ unsigned getBBSelectRegOut();
+
+ void setHasLoop(bool Value);
+
+ bool getHasLoop();
+
+ void addLiveOut(unsigned VReg);
+
+ void removeLiveOut(unsigned Reg);
+
+ void replaceLiveOut(unsigned OldReg, unsigned NewReg);
+
+ void replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI, bool ReplaceInside,
+ bool ReplaceOutside, bool IncludeLoopPHIs);
+
+ void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ DenseSet<unsigned> *getLiveOuts();
+
+ void setEntry(MachineBasicBlock *NewEntry);
+
+ MachineBasicBlock *getEntry();
+
+ void setExit(MachineBasicBlock *NewExit);
+
+ MachineBasicBlock *getExit();
+
+ void addMBB(MachineBasicBlock *MBB);
+
+ void addMBBs(LinearizedRegion *InnerRegion);
+
+ bool contains(MachineBasicBlock *MBB);
+
+ bool isLiveOut(unsigned Reg);
+
+ bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI);
+
+ void removeFalseRegisterKills(MachineRegisterInfo *MRI);
+
+ void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion();
+
+ ~LinearizedRegion();
+};
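+
+// A LinearizedRegion captures a region during and after linearization:
+// the blocks it covers, its entry and exit blocks, the registers that
+// are live out of it, and a link to its parent so live-out changes can
+// be propagated to every enclosing region.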
+
+class MRT {
+protected:
+ RegionMRT *Parent;
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+
+public:
+ unsigned getBBSelectRegIn() { return BBSelectRegIn; }
+
+ unsigned getBBSelectRegOut() { return BBSelectRegOut; }
+
+ void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; }
+
+ void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; }
+
+ virtual RegionMRT *getRegionMRT() { return nullptr; }
+
+ virtual MBBMRT *getMBBMRT() { return nullptr; }
+
+ bool isRegion() { return getRegionMRT() != nullptr; }
+
+ bool isMBB() { return getMBBMRT() != nullptr; }
+
+ bool isRoot() { return Parent == nullptr; }
+
+ void setParent(RegionMRT *Region) { Parent = Region; }
+
+ RegionMRT *getParent() { return Parent; }
+
+ static MachineBasicBlock *
+ initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap);
+
+ static RegionMRT *buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI);
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0;
+
+ void dumpDepth(int depth) {
+ for (int i = depth; i > 0; --i) {
+ dbgs() << " ";
+ }
+ }
+
+ virtual ~MRT() {}
+};
+
+class MBBMRT : public MRT {
+ MachineBasicBlock *MBB;
+
+public:
+ virtual MBBMRT *getMBBMRT() { return this; }
+
+ MachineBasicBlock *getMBB() { return MBB; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "MBB: " << getMBB()->getNumber();
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+ }
+
+ MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+};
+
+class RegionMRT : public MRT {
+protected:
+ MachineRegion *Region;
+ LinearizedRegion *LRegion;
+ MachineBasicBlock *Succ;
+
+ SetVector<MRT *> Children;
+
+public:
+ virtual RegionMRT *getRegionMRT() { return this; }
+
+ void setLinearizedRegion(LinearizedRegion *LinearizeRegion) {
+ LRegion = LinearizeRegion;
+ }
+
+ LinearizedRegion *getLinearizedRegion() { return LRegion; }
+
+ MachineRegion *getMachineRegion() { return Region; }
+
+ unsigned getInnerOutputRegister() {
+ return (*(Children.begin()))->getBBSelectRegOut();
+ }
+
+ void addChild(MRT *Tree) { Children.insert(Tree); }
+
+ SetVector<MRT *> *getChildren() { return &Children; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "Region: " << (void *)Region;
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+
+ dumpDepth(depth);
+ if (getSucc())
+ dbgs() << "Succ: " << getSucc()->getNumber() << "\n";
+ else
+ dbgs() << "Succ: none \n";
+ for (auto MRTI : Children) {
+ MRTI->dump(TRI, depth + 1);
+ }
+ }
+
+ MRT *getEntryTree() { return Children.back(); }
+
+ MRT *getExitTree() { return Children.front(); }
+
+ MachineBasicBlock *getEntry() {
+ MRT *Tree = Children.back();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ MachineBasicBlock *getExit() {
+ MRT *Tree = Children.front();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ void setSucc(MachineBasicBlock *MBB) { Succ = MBB; }
+
+ MachineBasicBlock *getSucc() { return Succ; }
+
+ bool contains(MachineBasicBlock *MBB) {
+ for (auto CI : Children) {
+ if (CI->isMBB()) {
+ if (MBB == CI->getMBBMRT()->getMBB()) {
+ return true;
+ }
+ } else {
+ if (CI->getRegionMRT()->contains(MBB)) {
+ return true;
+ } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr &&
+ CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ void replaceLiveOutReg(unsigned Register, unsigned NewRegister) {
+ LinearizedRegion *LRegion = getLinearizedRegion();
+ LRegion->replaceLiveOut(Register, NewRegister);
+ for (auto &CI : Children) {
+ if (CI->isRegion()) {
+ CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+ }
+ }
+ }
+
+ RegionMRT(MachineRegion *MachineRegion)
+ : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ virtual ~RegionMRT() {
+ if (LRegion) {
+ delete LRegion;
+ }
+
+ for (auto CI : Children) {
+ delete &(*CI);
+ }
+ }
+};
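+
+// The MRT is a tree mirroring the MachineRegionInfo structure: RegionMRT
+// nodes stand for machine regions and MBBMRT leaves for basic blocks.
+// Every node carries the virtual registers (BBSelectRegIn/BBSelectRegOut)
+// used to select the active block when control enters and leaves the
+// corresponding node.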
+
+static unsigned createBBSelectReg(const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI) {
+ return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32));
+}
+
+MachineBasicBlock *
+MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap) {
+ for (auto &MFI : MF) {
+ MachineBasicBlock *ExitMBB = &MFI;
+ if (ExitMBB->succ_size() == 0) {
+ return ExitMBB;
+ }
+ }
+ llvm_unreachable("CFG has no exit block");
+ return nullptr;
+}
+
+RegionMRT *MRT::buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII, MachineRegisterInfo *MRI) {
+ SmallPtrSet<MachineRegion *, 4> PlacedRegions;
+ DenseMap<MachineRegion *, RegionMRT *> RegionMap;
+ MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion();
+ RegionMRT *Result = new RegionMRT(TopLevelRegion);
+ RegionMap[TopLevelRegion] = Result;
+
+ // Insert the exit block first; we need it to be the merge node
+ // for the top-level region.
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap);
+
+ unsigned BBSelectRegIn = createBBSelectReg(TII, MRI);
+ MBBMRT *ExitMRT = new MBBMRT(Exit);
+ RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT);
+ ExitMRT->setBBSelectRegIn(BBSelectRegIn);
+
+ for (auto MBBI : post_order(&(MF.front()))) {
+ MachineBasicBlock *MBB = &(*MBBI);
+
+ // Skip Exit since we already added it
+ if (MBB == Exit) {
+ continue;
+ }
+
+ DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n");
+ MBBMRT *NewMBB = new MBBMRT(MBB);
+ MachineRegion *Region = RegionInfo->getRegionFor(MBB);
+
+ // Ensure we have the MRT region
+ if (RegionMap.count(Region) == 0) {
+ RegionMRT *NewMRTRegion = new RegionMRT(Region);
+ RegionMap[Region] = NewMRTRegion;
+
+ // Ensure all parents are in the RegionMap
+ MachineRegion *Parent = Region->getParent();
+ while (RegionMap.count(Parent) == 0) {
+ RegionMRT *NewMRTParent = new RegionMRT(Parent);
+ NewMRTParent->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(NewMRTParent);
+ RegionMap[Parent] = NewMRTParent;
+ NewMRTRegion = NewMRTParent;
+ Parent = Parent->getParent();
+ }
+ RegionMap[Parent]->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(RegionMap[Parent]);
+ }
+
+ // Add MBB to Region MRT
+ RegionMap[Region]->addChild(NewMBB);
+ NewMBB->setParent(RegionMap[Region]);
+ RegionMap[Region]->setSucc(Region->getExit());
+ }
+ return Result;
+}
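+
+// buildMRT adds the exit block first and then visits the remaining
+// blocks in post order, so within each RegionMRT the Children SetVector
+// keeps the exit tree at the front and the entry tree at the back;
+// getEntryTree()/getExitTree() above rely on this ordering.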
+
+void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ // If this is a source register to a PHI we are chaining, it
+ // must be live out.
+ if (PHIInfo.isSource(Reg)) {
+ DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If this is live out of the MBB
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (UI.getParent()->getParent() != MBB) {
+ DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If the use is in the same MBB, we have to make sure that it
+ // comes after the def; otherwise the register is live out
+ // through a loop.
+ MachineInstr *UseInstr = UI.getParent();
+ for (MachineBasicBlock::instr_iterator
+ MII = UseInstr->getIterator(),
+ MIE = UseInstr->getParent()->instr_end();
+ MII != MIE; ++MII) {
+ if ((&(*MII)) == DefInstr) {
+ DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+ << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (!Region->contains(UI.getParent()->getParent())) {
+ DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+ }
+ }
+ }
+
+ // If a successor has a PHI with a source coming from this MBB, we have
+ // to add that register as a live out.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end();
+ SI != E; ++SI) {
+ for (auto &II : *(*SI)) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (getPHIPred(PHI, i) == MBB) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+ << " -> BB#" << (*SI)->getNumber()
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *TopRegion) {
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+ PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+ TRI, PHIInfo);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *CurrentTopRegion) {
+ MachineBasicBlock *Exit = Region->getSucc();
+
+ RegionMRT *TopRegion =
+ CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+ // Check if the exit is the end of the function; if so, there are no
+ // live outs.
+ if (Exit == nullptr)
+ return;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isMBB()) {
+ auto MBB = CI->getMBBMRT()->getMBB();
+ storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+ } else {
+ LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+ // We should be limited to storing only the registers that are live
+ // out of the linearized region.
+ for (auto MBBI : SubRegion->MBBs) {
+ storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+ }
+ }
+ }
+
+ if (CurrentTopRegion == nullptr) {
+ auto Succ = Region->getSucc();
+ for (auto &II : *Succ) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (Region->contains(getPHIPred(PHI, i))) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "Linearized Region {";
+ bool IsFirst = true;
+ for (const auto &MBB : MBBs) {
+ if (IsFirst) {
+ IsFirst = false;
+ } else {
+ OS << " ,";
+ }
+ OS << MBB->getNumber();
+ }
+ OS << "} (" << Entry->getNumber() << ", "
+ << (Exit == nullptr ? -1 : Exit->getNumber())
+ << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+ << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+ for (auto &LI : LiveOuts) {
+ OS << PrintReg(LI, TRI) << " ";
+ }
+ OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+ return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+ return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+ if (isLiveOut(Reg))
+ LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+ if (isLiveOut(OldReg)) {
+ removeLiveOut(OldReg);
+ addLiveOut(NewReg);
+ }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI,
+ bool ReplaceInside, bool ReplaceOutside,
+ bool IncludeLoopPHI) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ DEBUG(dbgs() << "Pepareing to replace register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+ // If we are replacing outside, we also need to update the LiveOuts
+ if (ReplaceOutside &&
+ (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+ LinearizedRegion *Current = this;
+ while (Current != nullptr && Current->getEntry() != nullptr) {
+ DEBUG(dbgs() << "Region before register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current->replaceLiveOut(Register, NewRegister);
+ DEBUG(dbgs() << "Region after register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current = Current->getParent();
+ }
+ }
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+
+ // We don't rewrite defs.
+ if (O.isDef())
+ continue;
+
+ bool IsInside = contains(O.getParent()->getParent());
+ bool IsLoopPHI = IsInside && (O.getParent()->isPHI() &&
+ O.getParent()->getParent() == getEntry());
+ bool ShouldReplace = (IsInside && ReplaceInside) ||
+ (!IsInside && ReplaceOutside) ||
+ (IncludeLoopPHI && IsLoopPHI);
+ if (ShouldReplace) {
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ } else {
+ DEBUG(dbgs() << "Replacing register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs);
+}
+
+void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs);
+}
+
+DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; }
+
+void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) {
+ Entry = NewEntry;
+}
+
+MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; }
+
+void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; }
+
+MachineBasicBlock *LinearizedRegion::getExit() { return Exit; }
+
+void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); }
+
+void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) {
+ for (const auto &MBB : InnerRegion->MBBs) {
+ addMBB(MBB);
+ }
+}
+
+bool LinearizedRegion::contains(MachineBasicBlock *MBB) {
+ return MBBs.count(MBB) == 1;
+}
+
+bool LinearizedRegion::isLiveOut(unsigned Reg) {
+ return LiveOuts.count(Reg) == 1;
+}
+
+bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
+ return MRI->def_begin(Reg) == MRI->def_end();
+}
+
+// After the code has been structurized, uses that were flagged as kills
+// before are not necessarily register kills anymore.
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ for (auto MBBI : MBBs) {
+ MachineBasicBlock *MBB = MBBI;
+ for (auto &II : *MBB) {
+ for (auto &RI : II.uses()) {
+ if (RI.isReg()) {
+ unsigned Reg = RI.getReg();
+ if (TRI->isVirtualRegister(Reg)) {
+ if (hasNoDef(Reg, MRI))
+ continue;
+ if (!MRI->hasOneDef(Reg)) {
+ DEBUG(this->getEntry()->getParent()->dump());
+ DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n");
+ }
+
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ MachineOperand *Def = &(*(MRI->def_begin(Reg)));
+ MachineOperand *UseOperand = &(RI);
+ bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
+ if (UseIsOutsideDefMBB && UseOperand->isKill()) {
+ DEBUG(dbgs() << "Removing kill flag on register: "
+ << PrintReg(Reg, TRI) << "\n");
+ UseOperand->setIsKill(false);
+ }
+ }
+ }
+ }
+ }
+ }
+}
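+
+// After linearization, blocks that used to lie on mutually exclusive
+// paths execute on a single path, so a kill flag on a use outside the
+// defining block may no longer mark the last use of the value; clearing
+// those flags conservatively keeps the liveness information correct.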
+
+void LinearizedRegion::initLiveOut(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ storeLiveOuts(Region, MRI, TRI, PHIInfo);
+}
+
+LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ setEntry(MBB);
+ setExit(MBB);
+ storeLiveOuts(MBB, MRI, TRI, PHIInfo);
+ MBBs.insert(MBB);
+ Parent = nullptr;
+}
+
+LinearizedRegion::LinearizedRegion() {
+ setEntry(nullptr);
+ setExit(nullptr);
+ Parent = nullptr;
+}
+
+LinearizedRegion::~LinearizedRegion() {}
+
+class AMDGPUMachineCFGStructurizer : public MachineFunctionPass {
+private:
+ const MachineRegionInfo *Regions;
+ const SIInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ unsigned BBSelectRegister;
+ PHILinearize PHIInfo;
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
+
+ void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices);
+
+ void storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices = nullptr);
+
+ unsigned storePHILinearizationInfo(MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices);
+
+ void extractKilledPHIs(MachineBasicBlock *MBB);
+
+ bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg);
+
+ bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg);
+
+ void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceLiveOutRegs(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg,
+ LinearizedRegion *LRegion);
+ void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ MachineInstr &PHI, LinearizedRegion *LRegion);
+
+ void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion);
+ void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB,
+ MachineInstr &PHI);
+ void rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB);
+
+ bool regionIsSimpleIf(RegionMRT *Region);
+
+ void transformSimpleIfRegion(RegionMRT *Region);
+
+ void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
+
+ void insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL = DebugLoc());
+
+ MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region);
+
+ void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB, unsigned DestRegister,
+ unsigned IfSourceRegister, unsigned CodeSourceRegister,
+ bool IsUndefIfSource = false);
+
+ MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd,
+ MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds);
+
+ void prunePHIInfo(MachineBasicBlock *MBB);
+ void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg);
+
+ void createEntryPHIs(LinearizedRegion *CurrentRegion);
+ void resolvePHIInfos(MachineBasicBlock *FunctionEntry);
+
+ void replaceRegisterWith(unsigned Register, unsigned NewRegister);
+
+ MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBB,
+ LinearizedRegion *LRegion,
+ unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut);
+
+ MachineBasicBlock *
+ createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut);
+ void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond);
+
+ void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg);
+
+ MachineInstr *getDefInstr(unsigned Reg);
+ void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion, unsigned DestReg,
+ unsigned SourceReg);
+ bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion,
+ unsigned Register);
+ void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion);
+
+ void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion);
+ void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitExit(LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitEntry(LinearizedRegion *LRegion);
+
+ LinearizedRegion *initLinearizedRegion(RegionMRT *Region);
+
+ bool structurizeComplexRegion(RegionMRT *Region);
+
+ bool structurizeRegion(RegionMRT *Region);
+
+ bool structurizeRegions(RegionMRT *Region, bool isTopRegion);
+
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineRegionInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void initFallthroughMap(MachineFunction &MF);
+
+ void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut);
+
+ unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII);
+
+ RegionMRT *RMRT;
+ void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Succ = Region->getSucc();
+ bool FoundBypass = false;
+ bool FoundIf = false;
+
+ if (Entry->succ_size() != 2) {
+ return false;
+ }
+
+ for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+ E = Entry->succ_end();
+ SI != E; ++SI) {
+ MachineBasicBlock *Current = *SI;
+
+ if (Current == Succ) {
+ FoundBypass = true;
+ } else if ((Current->succ_size() == 1) &&
+ *(Current->succ_begin()) == Succ) {
+ FoundIf = true;
+ }
+ }
+
+ return FoundIf && FoundBypass;
+}
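+
+// The "simple if" shape accepted above is:
+//
+//      Entry
+//      /   \
+//    If     |   (bypass edge straight to the successor)
+//      \   /
+//      Succ
+//
+// i.e. the entry has exactly two successors: the region successor itself
+// and a single block whose only successor is the region successor.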
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Exit = Region->getExit();
+ TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+ if (MBB->succ_size() == 1) {
+ auto *Succ = *(MBB->succ_begin());
+ for (auto &TI : MBB->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB() && UI.getMBB() != Succ) {
+ UI.setMBB(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void fixRegionTerminator(RegionMRT *Region) {
+ MachineBasicBlock *InternalSucc = nullptr;
+ MachineBasicBlock *ExternalSucc = nullptr;
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ auto Exit = LRegion->getExit();
+
+ SmallPtrSet<MachineBasicBlock *, 2> Successors;
+ for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+ SE = Exit->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ if (LRegion->contains(Succ)) {
+ // Do not allow re-assignment.
+ assert(InternalSucc == nullptr);
+ InternalSucc = Succ;
+ } else {
+ // Do not allow re-assignment.
+ assert(ExternalSucc == nullptr);
+ ExternalSucc = Succ;
+ }
+ }
+
+ for (auto &TI : Exit->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB()) {
+ auto Target = UI.getMBB();
+ if (Target != InternalSucc && Target != ExternalSucc) {
+ UI.setMBB(ExternalSucc);
+ }
+ }
+ }
+ }
+}
+
+// If a region is just a sequence of regions (and the exit block in the
+// case of the top-level region), we can simply skip linearizing it,
+// because it is already linear.
+static bool regionIsSequence(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static void fixupRegionExits(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ fixMBBTerminator(CI->getMBBMRT()->getMBB());
+ } else {
+ fixRegionTerminator(CI->getRegionMRT());
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (!Region->contains(Pred)) {
+ PHINonRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices) {
+ if (RegionIndices) {
+ for (auto i : *RegionIndices) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ } else {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+ MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+ unsigned DestReg = getPHIDestReg(PHI);
+ unsigned LinearizeDestReg =
+ MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+ PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+ storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+ return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+ // We need to create a new chain for the killed phi, but there is no
+ // need to do the renaming outside or inside the block.
+ SmallPtrSet<MachineInstr *, 2> PHIs;
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ unsigned PHIDestReg = getPHIDestReg(Instr);
+ DEBUG(dbgs() << "Extractking killed phi:\n");
+ DEBUG(Instr.dump());
+ PHIs.insert(&Instr);
+ PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+ storePHILinearizationInfoDest(PHIDestReg, Instr);
+ }
+ }
+
+ for (auto PI : PHIs) {
+ PI->eraseFromParent();
+ }
+}
+
+static bool isPHIRegionIndex(const SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned Index) {
+ for (auto i : PHIRegionIndices) {
+ if (i == Index)
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ DEBUG(dbgs() << "Shrink PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool Replaced = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ int SingleExternalEntryIndex = -1;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIIndices, i)) {
+ if (SingleExternalEntryIndex == -1) {
+ // Single entry
+ SingleExternalEntryIndex = i;
+ } else {
+ // Multiple entries
+ SingleExternalEntryIndex = -2;
+ }
+ }
+ }
+
+ if (SingleExternalEntryIndex > -1) {
+ *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+ // We should not rewrite the code; we should only pick up the single
+ // value that represents the shrunk PHI.
+ Replaced = true;
+ } else {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ if (SourceMBB) {
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(SourceMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << SourceMBB->getNumber());
+ }
+
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ }
+ PHI.eraseFromParent();
+ return Replaced;
+}
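+
+// For illustration (register numbers hypothetical): shrinking
+//   %5<def> = PHI(%1, BB#2, %2, BB#3, %3, BB#7)
+// with PHIIndices = {0, 1} leaves %3 as the single external input, so the
+// PHI is erased and *ReplaceReg is set to %3. With more than one external
+// input, a smaller PHI is rebuilt from CombinedSourceReg/SourceMBB plus
+// the remaining external edges.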
+
+void AMDGPUMachineCFGStructurizer::replacePHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool HasExternalEdge = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIRegionIndices, i)) {
+ HasExternalEdge = true;
+ }
+ }
+
+ if (HasExternalEdge) {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(LastMerge);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << LastMerge->getNumber());
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ } else {
+ replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
+ }
+ PHI.eraseFromParent();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace entry PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with ");
+
+ unsigned NumInputs = getPHINumInputs(PHI);
+ unsigned NumNonRegionInputs = NumInputs;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ NumNonRegionInputs--;
+ }
+ }
+
+ if (NumNonRegionInputs == 0) {
+ auto DestReg = getPHIDestReg(PHI);
+ replaceRegisterWith(DestReg, CombinedSourceReg);
+ DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n");
+ PHI.eraseFromParent();
+ } else {
+ DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI(");
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(IfMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << IfMBB->getNumber());
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ PHI.eraseFromParent();
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
+ MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg, LinearizedRegion *LRegion) {
+ bool WasLiveOut = false;
+ for (auto PII : PHIRegionIndices) {
+ unsigned Reg = getPHISourceReg(PHI, PII);
+ if (LRegion->isLiveOut(Reg)) {
+ bool IsDead = true;
+
+ // Check if register is live out of the basic block
+ MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent();
+ for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) {
+ if ((*UI).getParent()->getParent() != DefMBB) {
+ IsDead = false;
+ }
+ }
+
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ if (IsDead) {
+ LRegion->removeLiveOut(Reg);
+ }
+ WasLiveOut = true;
+ }
+ }
+
+ if (WasLiveOut)
+ LRegion->addLiveOut(CombinedSourceReg);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ MachineInstr &PHI,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(Region, PHI, PHIRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHIRegionIndices);
+
+ replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices);
+ replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB,
+ MachineInstr &PHI) {
+ SmallVector<unsigned, 2> PHINonRegionIndices;
+ getPHINonRegionIndices(Region, PHI, PHINonRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHINonRegionIndices);
+ replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices);
+}
+
+static void collectPHIs(MachineBasicBlock *MBB,
+ SmallVector<MachineInstr *, 2> &PHIs) {
+ for (auto &BBI : *MBB) {
+ if (BBI.isPHI()) {
+ PHIs.push_back(&BBI);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Exit = Region->getSucc();
+ if (Exit == nullptr)
+ return;
+
+ collectPHIs(Exit, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Entry = Region->getEntry();
+
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionEntryPHI(Region, IfMBB, *PHII);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL) {
+ DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
+ MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
+ bool HasTerminator = Terminator != MBB->instr_end();
+ if (HasTerminator) {
+ TII->ReplaceTailWithBranchTo(Terminator, Dest);
+ }
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) {
+ TII->insertUnconditionalBranch(*MBB, Dest, DL);
+ }
+}
+
+static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) {
+ MachineBasicBlock *Result = nullptr;
+ for (auto &MFI : MF) {
+ if (MFI.succ_size() == 0) {
+ if (Result == nullptr) {
+ Result = &MFI;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return Result;
+}
+
+static bool hasOneExitNode(MachineFunction &MF) {
+ return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+ auto Exit = Region->getSucc();
+
+ // If the exit is the end of the function, we just use the existing
+ // exit block.
+ MachineFunction *MF = Region->getEntry()->getParent();
+ if (Exit == nullptr && hasOneExitNode(*MF)) {
+ return &(*(--(Region->getEntry()->getParent()->end())));
+ }
+
+ MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+ if (Exit == nullptr) {
+ MachineFunction::iterator ExitIter = MF->end();
+ MF->insert(ExitIter, LastMerge);
+ } else {
+ MachineFunction::iterator ExitIter = Exit->getIterator();
+ MF->insert(ExitIter, LastMerge);
+ LastMerge->addSuccessor(Exit);
+ insertUnconditionalBranch(LastMerge, Exit);
+ DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ }
+ return LastMerge;
+}
+
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned DestRegister,
+ unsigned IfSourceRegister,
+ unsigned CodeSourceRegister,
+ bool IsUndefIfSource) {
+ // If this is the function exit block, we don't need a phi.
+ if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+ return;
+ }
+ DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+ << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI("
+ << PrintReg(IfSourceRegister, TRI) << ", BB#"
+               << IfBB->getNumber() << ", "
+               << PrintReg(CodeSourceRegister, TRI)
+ << ", BB#" << CodeBB->getNumber() << ")\n");
+ const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+ MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestRegister);
+  // FIXME: Marking the if-source undef is disabled for now.
+  if (IsUndefIfSource && false) {
+ MIB.addReg(IfSourceRegister, RegState::Undef);
+ } else {
+ MIB.addReg(IfSourceRegister);
+ }
+ MIB.addMBB(IfBB);
+ MIB.addReg(CodeSourceRegister);
+ MIB.addMBB(CodeBB);
+}
+
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+  // Copy the successor list first; removeSuccessor invalidates the
+  // successor iterators we would otherwise be walking.
+  SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(),
+                                            MBB->succ_end());
+  for (MachineBasicBlock *Succ : Succs) {
+    if (Succ != MBB)
+      MBB->removeSuccessor(Succ);
+  }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+ MachineBasicBlock *EndMBB) {
+
+  // We have to check against the StartMBB successor because a
+ // structurized region with a loop will have the entry block split,
+ // and the backedge will go to the entry successor.
+ DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+ unsigned SuccSize = StartMBB->succ_size();
+ if (SuccSize > 0) {
+ MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+ for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+ E = EndMBB->succ_end();
+ PI != E; ++PI) {
+ // Either we have a back-edge to the entry block, or a back-edge to the
+      // successor of the entry block since the block may be split.
+ if ((*PI) != StartMBB &&
+ !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+ }
+ }
+ }
+
+ for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+ E = StartMBB->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != EndMBB) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+ }
+ }
+
+ for (auto SI : Succs) {
+ std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+ DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+ << Edge.second->getNumber() << "\n");
+ Edge.first->removeSuccessor(Edge.second);
+ }
+}
+
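+// Create an if block in front of the linearized code region spanning
+// CodeBBStart..CodeBBEnd. The if block compares the bb-select register
+// IfReg against the number of SelectBB and branches either into the
+// code region or directly to MergeBB.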
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds) {
+ MachineFunction *MF = MergeBB->getParent();
+ MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+ if (InheritPreds) {
+ for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+ E = CodeBBStart->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != CodeBBEnd) {
+ MachineBasicBlock *Pred = (*PI);
+ Pred->addSuccessor(IfBB);
+ }
+ }
+ }
+
+ removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+ auto CodeBBStartI = CodeBBStart->getIterator();
+ auto CodeBBEndI = CodeBBEnd->getIterator();
+ auto MergeIter = MergeBB->getIterator();
+ MF->insert(MergeIter, IfBB);
+ MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+ IfBB->addSuccessor(MergeBB);
+ IfBB->addSuccessor(CodeBBStart);
+
+ DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+  // Ensure that MergeBB is a successor of CodeBBEnd.
+ if (!CodeBBEnd->isSuccessor(MergeBB))
+ CodeBBEnd->addSuccessor(MergeBB);
+
+ DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+ << CodeBBEnd->getNumber() << "\n");
+
+  // If we have a single predecessor, we can find a reasonable debug location.
+ MachineBasicBlock *SinglePred =
+ CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+ const DebugLoc &DL = SinglePred
+ ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+ : DebugLoc();
+
+ unsigned Reg =
+ TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+ SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+ if (&(*(IfBB->getParent()->begin())) == IfBB) {
+ TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+ CodeBBStart->getNumber());
+ }
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+ return IfBB;
+}
+
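+// Clear the kill flags on all uses of the condition register; the
+// register gains an additional use when the branch condition is reused
+// for the vector select in rewriteCodeBBTerminator.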
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+ SmallVector<MachineOperand, 1> Cond) {
+ if (Cond.size() != 1)
+ return;
+ if (!Cond[0].isReg())
+ return;
+
+ unsigned CondReg = Cond[0].getReg();
+ for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+ (*UI).setIsKill(false);
+ }
+}
+
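+// Rewrite the terminator of CodeBB so that, instead of branching, the
+// block writes the number of the intended successor into BBSelectReg
+// (via a vector select if there are two candidates) and then branches
+// unconditionally to MergeBB.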
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg) {
+ MachineBasicBlock *TrueBB = nullptr;
+ MachineBasicBlock *FalseBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+ MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+ TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+ const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+ if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+ // This is an exit block, hence no successors. We will assign the
+ // bb select register to the entry block.
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg,
+ CodeBB->getParent()->begin()->getNumber());
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+ return;
+ }
+
+ if (FalseBB == nullptr && TrueBB == nullptr) {
+ TrueBB = FallthroughBB;
+ } else if (TrueBB != nullptr) {
+ FalseBB =
+ (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB;
+ }
+
+ if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) {
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, TrueBB->getNumber());
+ } else {
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
+ unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
+ unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ TrueBBReg, TrueBB->getNumber());
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ FalseBBReg, FalseBB->getNumber());
+ ensureCondIsNotKilled(Cond);
+ TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, Cond, TrueBBReg, FalseBBReg);
+ }
+
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+}
+
+MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ DEBUG(dbgs() << "DEFS BEGIN:\n");
+ for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
+ DEBUG(DI->getParent()->dump());
+ }
+ DEBUG(dbgs() << "DEFS END\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ return (*(MRI->def_begin(Reg))).getParent();
+}
+
+void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ unsigned DestReg,
+ unsigned SourceReg) {
+ // In this function we know we are part of a chain already, so we need
+ // to add the registers to the existing chain, and rename the register
+ // inside the region.
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ MachineInstr *DefInstr = getDefInstr(SourceReg);
+ if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) {
+ // Handle the case where the def is a PHI-def inside a basic
+ // block, then we only need to do renaming. Special care needs to
+ // be taken if the PHI-def is part of an existing chain, or if a
+ // new one needs to be created.
+ InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI);
+
+ // We collect all PHI Information, and if we are at the region entry,
+ // all PHIs will be removed, and then re-introduced if needed.
+ storePHILinearizationInfoDest(DestReg, *DefInstr);
+ // We have picked up all the information we need now and can remove
+ // the PHI
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ DefInstr->eraseFromParent();
+ } else {
+    // This is not a phi-def, or it is a phi-def from a linearized region.
+ if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) {
+ // If this is a single BB and the definition is in this block we
+ // need to replace any uses outside the region.
+ InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+ }
+ const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+ unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+ DEBUG(dbgs() << "Insert Chained PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+ SourceReg, IsLastDef);
+
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ if (IsLastDef) {
+ const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+ NextDestReg, 0);
+ PHIInfo.deleteDef(DestReg);
+ } else {
+ PHIInfo.replaceDef(DestReg, NextDestReg);
+ }
+ }
+}
+
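+// Return true if Register is defined in MBB or inside InnerRegion.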
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+ LinearizedRegion *InnerRegion,
+ unsigned Register) {
+ return getDefInstr(Register)->getParent() == MBB ||
+ InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion) {
+ DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+ SmallVector<unsigned, 4> OldLiveOuts;
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ for (auto OLI : *LiveOuts) {
+ OldLiveOuts.push_back(OLI);
+ }
+
+ for (auto LI : OldLiveOuts) {
+ DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+ if (!containsDef(CodeBB, InnerRegion, LI) ||
+ (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+      // If the register simply lives through the CodeBB, we don't have
+ // to rewrite anything since the register is not defined in this
+ // part of the code.
+ DEBUG(dbgs() << "- through");
+ continue;
+ }
+ DEBUG(dbgs() << "\n");
+ unsigned Reg = LI;
+ if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+ // If the register is live out, we do want to create a phi,
+      // unless it is from the Exit block, because in that case there
+ // is already a PHI, and no need to create a new one.
+
+ // If the register is just a live out def and not part of a phi
+ // chain, we need to create a PHI node to handle the if region,
+ // and replace all uses outside of the region with the new dest
+ // register, unless it is the outgoing BB select register. We have
+      // already created phi nodes for these.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+ unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+      // Create an initializer; this value is never used, but is needed
+ // to satisfy SSA.
+ DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+ IfSourceReg, 0);
+
+ InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+ DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+ IfSourceReg, Reg, true);
+ }
+ }
+
+ // Handle the chained definitions in PHIInfo, checking if this basic block
+ // is a source block for a definition.
+ SmallVector<unsigned, 4> Sources;
+ if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+ DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+ << "\n");
+ for (auto SI : Sources) {
+ unsigned DestReg;
+ PHIInfo.findDest(SI, CodeBB, DestReg);
+ insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+ }
+ DEBUG(dbgs() << "Insertion done.\n");
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Before PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+ SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+      EliminatedSources;
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ bool MBBContainsPHISource = false;
+ // Check if there is a PHI source in this MBB
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() == MBB) {
+ MBBContainsPHISource = true;
+ }
+ }
+
+ // If so, all other sources are useless since we know this block
+ // is always executed when the region is executed.
+ if (MBBContainsPHISource) {
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ PHILinearize::PHISourceT Source = *SRI;
+ unsigned SourceReg = Source.first;
+ MachineBasicBlock *SourceMBB = Source.second;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() != MBB) {
+          EliminatedSources.push_back(
+ std::make_tuple(DestReg, SourceReg, SourceMBB));
+ }
+ }
+ }
+ }
+
+ // Remove the PHI sources that are in the given MBB
+  for (auto &SourceInfo : EliminatedSources) {
+ PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+ std::get<2>(SourceInfo));
+ }
+ DEBUG(dbgs() << "After PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+}
+
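+// Create the entry PHI for DestReg. Sources defined inside the region
+// are folded into a chain of backedge PHIs placed at their defining
+// blocks; the chained value then enters the entry PHI through the
+// region exit.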
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+ unsigned DestReg) {
+ MachineBasicBlock *Entry = CurrentRegion->getEntry();
+ MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+ DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+ << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+ int NumSources = 0;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ NumSources++;
+ }
+
+ if (NumSources == 1) {
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ replaceRegisterWith(DestReg, SourceReg);
+ } else {
+ const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+ MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestReg);
+ DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI(");
+
+ unsigned CurrentBackedgeReg = 0;
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+
+ if (CurrentRegion->contains((*SRI).second)) {
+ if (CurrentBackedgeReg == 0) {
+ CurrentBackedgeReg = SourceReg;
+ } else {
+ MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+ MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+ const TargetRegisterClass *RegClass =
+ MRI->getRegClass(CurrentBackedgeReg);
+ unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ MachineInstrBuilder BackedgePHI =
+ BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), NewBackedgeReg);
+ BackedgePHI.addReg(CurrentBackedgeReg);
+ BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+ BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+ BackedgePHI.addMBB((*SRI).second);
+          DEBUG(dbgs() << "Inserting backedge PHI: "
+                       << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI("
+                       << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                       << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+                       << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+                       << ", BB#" << (*SRI).second->getNumber());
+          CurrentBackedgeReg = NewBackedgeReg;
+ }
+ } else {
+ MIB.addReg(SourceReg);
+ MIB.addMBB((*SRI).second);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << (*SRI).second->getNumber() << ", ");
+ }
+ }
+
+ // Add the final backedge register source to the entry phi
+ if (CurrentBackedgeReg != 0) {
+ MIB.addReg(CurrentBackedgeReg);
+ MIB.addMBB(Exit);
+ DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << Exit->getNumber() << ")\n");
+ } else {
+ DEBUG(dbgs() << ")\n");
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+ DEBUG(PHIInfo.dump(MRI));
+
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ createEntryPHI(CurrentRegion, DestReg);
+ }
+ PHIInfo.clear();
+}
+
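+// Replace all defs and uses of Register with NewRegister and update the
+// PHI bookkeeping and region live-out information accordingly.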
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+ unsigned NewRegister) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+      // We don't handle physical registers, but if we need to in the
+      // future, this is how it would be done:
+ // O.substPhysReg(NewRegister, *TRI);
+ } else {
+ DEBUG(dbgs() << "Replacing register: "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ PHIInfo.deleteDef(Register);
+
+ getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
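+// At this point every PHI destination is expected to have exactly one
+// remaining source, so each PHI can be resolved by renaming.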
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+ DEBUG(dbgs() << "Resolve PHI Infos\n");
+ DEBUG(PHIInfo.dump(MRI));
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+ << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+    ++SRI;
+    assert(PHIInfo.sources_end(DestReg) == SRI &&
+           "More than one phi source in entry node");
+ replaceRegisterWith(DestReg, SourceReg);
+ }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+ return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+ LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut) {
+ if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+    // Handle the non-loop function entry block. Entry blocks that are
+    // part of a loop are handled by the general case below.
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ resolvePHIInfos(CodeBB);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ }
+ if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) {
+ // Handle non-loop region entry block.
+ MachineFunction *MF = MergeBB->getParent();
+ auto MergeIter = MergeBB->getIterator();
+ auto CodeBBStartIter = CodeBB->getIterator();
+ auto CodeBBEndIter = ++(CodeBB->getIterator());
+ if (CodeBBEndIter != MergeIter) {
+ MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter);
+ }
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ prunePHIInfo(CodeBB);
+ createEntryPHIs(CurrentRegion);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ } else {
+ // Handle internal block.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
+ unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
+ bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
+ BBSelectRegIn, IsRegionEntryBB);
+ CurrentRegion->addMBB(IfBB);
+ // If this is the entry block we need to make the If block the new
+ // linearized region entry.
+ if (IsRegionEntryBB) {
+ CurrentRegion->setEntry(IfBB);
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg = TII->insertNE(
+ RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp =
+ MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(CurrentRegion->getEntry());
+ }
+ }
+ CurrentRegion->addMBB(CodeBB);
+ LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
+
+ InnerRegion.setParent(CurrentRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+ InnerRegion.addMBB(MergeBB);
+
+ DEBUG(InnerRegion.print(dbgs(), TRI));
+ rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
+ extractKilledPHIs(CodeBB);
+ if (IsRegionEntryBB) {
+ createEntryPHIs(CurrentRegion);
+ }
+ return IfBB;
+ }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut) {
+ unsigned CodeBBSelectReg =
+ InnerRegion->getRegionMRT()->getInnerOutputRegister();
+ MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry();
+ MachineBasicBlock *CodeExitBB = InnerRegion->getExit();
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB,
+ SelectBB, BBSelectRegIn, true);
+ CurrentRegion->addMBB(IfBB);
+ bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+ if (isEntry) {
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg =
+ TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(IfBB);
+ }
+ }
+ CurrentRegion->addMBBs(InnerRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+
+ rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+ CurrentRegion);
+
+ rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+  if (isEntry) {
+    CurrentRegion->setEntry(IfBB);
+    createEntryPHIs(CurrentRegion);
+  }
+
+ return IfBB;
+}
+
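+// Split a loop-header PHI: the out-of-loop sources remain in a
+// (shrunken) PHI in Entry, while the single in-region backedge source
+// moves into a new PHI in EntrySucc that merges the two values.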
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+ MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+ assert(PHIRegionIndices.size() == 1);
+
+ unsigned RegionIndex = PHIRegionIndices[0];
+ unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+ MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+ unsigned PHIDest = getPHIDestReg(PHI);
+ unsigned PHISource = PHIDest;
+ unsigned ReplaceReg;
+
+ if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+ PHISource = ReplaceReg;
+ }
+
+ const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+ unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+ MachineInstrBuilder MIB =
+ BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewDestReg);
+ DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+ << "<def> = PHI(");
+ MIB.addReg(PHISource);
+ MIB.addMBB(Entry);
+ DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+ MIB.addReg(RegionSourceReg);
+ MIB.addMBB(RegionSourceMBB);
+ DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#"
+ << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+ }
+}
+
+// Split the exit block so that we can insert an end-control-flow
+// instruction.
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+ auto MRTRegion = LRegion->getRegionMRT();
+ auto Exit = LRegion->getExit();
+ auto MF = Exit->getParent();
+ auto Succ = MRTRegion->getSucc();
+
+ auto NewExit = MF->CreateMachineBasicBlock();
+ auto AfterExitIter = Exit->getIterator();
+ AfterExitIter++;
+ MF->insert(AfterExitIter, NewExit);
+ Exit->removeSuccessor(Succ);
+ Exit->addSuccessor(NewExit);
+ NewExit->addSuccessor(Succ);
+ insertUnconditionalBranch(NewExit, Succ);
+ LRegion->addMBB(NewExit);
+ LRegion->setExit(NewExit);
+
+ DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+ // Replace any PHI Predecessors in the successor with NewExit
+ for (auto &II : *Succ) {
+ MachineInstr &Instr = II;
+
+ // If we are past the PHI instructions we are done
+ if (!Instr.isPHI())
+ break;
+
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ auto Pred = getPHIPred(Instr, i);
+ if (Pred == Exit) {
+ setPhiPred(Instr, i, NewExit);
+ }
+ }
+ }
+
+ return NewExit;
+}
+
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+ // Create the fall-through block.
+ MachineBasicBlock *MBB = (*I).getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+ auto MBBIter = ++(MBB->getIterator());
+ MF->insert(MBBIter, SuccMBB);
+ SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(SuccMBB);
+
+ // Splice the code over.
+ SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+ return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of the
+// code. This is needed to insert an initializer for the bb select
+// register in loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+ MachineBasicBlock *Entry = LRegion->getEntry();
+ MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+ MachineBasicBlock *Exit = LRegion->getExit();
+
+ DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+ << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+ << "\n");
+ LRegion->addMBB(EntrySucc);
+
+  // Make the backedge go to EntrySucc.
+ if (Exit->isSuccessor(Entry)) {
+ Exit->removeSuccessor(Entry);
+ }
+ Exit->addSuccessor(EntrySucc);
+ MachineInstr &Branch = *(Exit->instr_rbegin());
+ for (auto &UI : Branch.uses()) {
+ if (UI.isMBB() && UI.getMBB() == Entry) {
+ UI.setMBB(EntrySucc);
+ }
+ }
+
+ splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+ return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+ LRegion->setEntry(Region->getEntry());
+ return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+ MachineBasicBlock *Exit = Region->getSucc();
+ if (Exit == nullptr) {
+ return;
+ }
+ for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(),
+ E = Exit->pred_end();
+ PI != E; ++PI) {
+ if (Region->contains(*PI)) {
+ (*PI)->removeSuccessor(Exit);
+ }
+ }
+}
+
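+// Return true if MBB has a successor that is already in the visited
+// set, i.e. the edge is a backedge.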
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if (MBBs.count(*SI) != 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ // Need to traverse this in reverse since it is in post order.
+ if (Tree == nullptr)
+ return false;
+
+ if (Tree->isMBB()) {
+ MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+ MBBs.insert(MBB);
+ if (mbbHasBackEdge(MBB, MBBs)) {
+ return true;
+ }
+ } else {
+ RegionMRT *Region = Tree->getRegionMRT();
+ SetVector<MRT *> *Children = Region->getChildren();
+ for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+ if (containsNewBackedge(*CI, MBBs))
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+ SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+ return containsNewBackedge(Region, MBBs);
+}
+
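+// Linearize the children of Region in order. Each child, whether an
+// inner region or a plain block, is wrapped in an if block that either
+// executes the child or skips to the next merge point, with the
+// bb-select registers deciding at run time which path is taken.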
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+ auto *LRegion = initLinearizedRegion(Region);
+ LRegion->setHasLoop(containsNewBackedge(Region));
+ MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+ MachineBasicBlock *CurrentMerge = LastMerge;
+ LRegion->addMBB(LastMerge);
+ LRegion->setExit(LastMerge);
+
+ rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+ removeOldExitPreds(Region);
+
+ DEBUG(PHIInfo.dump(MRI));
+
+ SetVector<MRT *> *Children = Region->getChildren();
+ DEBUG(dbgs() << "===========If Region Start===============\n");
+ if (LRegion->getHasLoop()) {
+ DEBUG(dbgs() << "Has Backedge: Yes\n");
+ } else {
+ DEBUG(dbgs() << "Has Backedge: No\n");
+ }
+
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+ for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+ DEBUG(dbgs() << "CurrentRegion: \n");
+ DEBUG(LRegion->print(dbgs(), TRI));
+
+ MRT *Child = (*CI);
+
+ if (Child->isRegion()) {
+
+ LinearizedRegion *InnerLRegion =
+ Child->getRegionMRT()->getLinearizedRegion();
+      // This child is an inner region; we need to splice its blocks
+      // into the current linearized region.
+
+ DEBUG(dbgs() << "Linearizing region: ");
+ DEBUG(InnerLRegion->print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+
+ MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+ if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+ // Entry has already been linearized, no need to do this region.
+ unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+ unsigned InnerSelectReg =
+ InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+        replaceRegisterWith(InnerSelectReg, OuterSelect);
+        resolvePHIInfos(InnerEntry);
+ if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+ InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+ Child->getRegionMRT()->getEntry(),
+ BBSelectRegIn, BBSelectRegOut);
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ } else {
+ MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+ DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+ if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+ // If this is the exit block then we need to skip to the next.
+ // The "in" register will be transferred to "out" in the next
+ // iteration.
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+      // This is a basic block that is not part of an inner region; we
+ // need to put it in the current linearized region.
+ CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn,
+ BBSelectRegOut);
+ if (CurrentMerge) {
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+ }
+ }
+
+ LRegion->removeFalseRegisterKills(MRI);
+
+ if (LRegion->getHasLoop()) {
+ MachineBasicBlock *NewSucc = splitEntry(LRegion);
+ if (isFunctionEntryBlock(LRegion->getEntry())) {
+ resolvePHIInfos(LRegion->getEntry());
+ }
+ const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
+ unsigned InReg = LRegion->getBBSelectRegIn();
+ unsigned InnerSelectReg =
+ MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ TII->materializeImmediate(*(LRegion->getEntry()),
+ LRegion->getEntry()->getFirstTerminator(), DL,
+ NewInReg, Region->getEntry()->getNumber());
+ // Need to be careful about updating the registers inside the region.
+ LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
+ DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
+ InnerSelectReg, NewInReg,
+ LRegion->getRegionMRT()->getInnerOutputRegister());
+ splitExit(LRegion);
+ TII->convertNonUniformLoopRegion(NewSucc, LastMerge);
+ }
+
+ if (Region->isRoot()) {
+ TII->insertReturn(*LastMerge);
+ }
+
+ DEBUG(Region->getEntry()->getParent()->dump());
+ DEBUG(LRegion->print(dbgs(), TRI));
+ DEBUG(PHIInfo.dump(MRI));
+
+ DEBUG(dbgs() << "===========If Region End===============\n");
+
+ Region->setLinearizedRegion(LRegion);
+ return true;
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) {
+  // FIXME: The simple-if fast path is disabled for now.
+  if (false && regionIsSimpleIf(Region)) {
+    transformSimpleIfRegion(Region);
+    return true;
+  }
+  if (regionIsSequence(Region)) {
+    fixupRegionExits(Region);
+    return false;
+  }
+  return structurizeComplexRegion(Region);
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
+ bool isTopRegion) {
+ bool Changed = false;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isRegion()) {
+ Changed |= structurizeRegions(CI->getRegionMRT(), false);
+ }
+ }
+
+  Changed |= structurizeRegion(Region);
+ return Changed;
+}
+
+void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
+ DEBUG(dbgs() << "Fallthrough Map:\n");
+ for (auto &MBBI : MF) {
+ MachineBasicBlock *MBB = MBBI.getFallThrough();
+ if (MBB != nullptr) {
+ DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
+ }
+ FallthroughMap[&MBBI] = MBB;
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
+ unsigned SelectOut) {
+ LinearizedRegion *LRegion = new LinearizedRegion();
+ if (SelectOut) {
+ LRegion->addLiveOut(SelectOut);
+ DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI)
+ << "\n");
+ }
+ LRegion->setRegionMRT(Region);
+ Region->setLinearizedRegion(LRegion);
+ LRegion->setParent(Region->getParent()
+ ? Region->getParent()->getLinearizedRegion()
+ : nullptr);
+}
+
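+// Recursively assign bb-select registers over the region tree: every
+// node is given SelectOut as its output register and a freshly created
+// virtual register as its input, chaining the selects from child to
+// child.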
+unsigned
+AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII) {
+ if (MRT->isRegion()) {
+ RegionMRT *Region = MRT->getRegionMRT();
+ Region->setBBSelectRegOut(SelectOut);
+ unsigned InnerSelectOut = createBBSelectReg(TII, MRI);
+
+    // FIXME: Move linearized region creation to the original spot.
+ createLinearizedRegion(Region, SelectOut);
+
+ for (auto CI = Region->getChildren()->begin(),
+ CE = Region->getChildren()->end();
+ CI != CE; ++CI) {
+ InnerSelectOut =
+ initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII);
+ }
+ MRT->setBBSelectRegIn(InnerSelectOut);
+ return InnerSelectOut;
+ } else {
+ MRT->setBBSelectRegOut(SelectOut);
+ unsigned NewSelectIn = createBBSelectReg(TII, MRI);
+ MRT->setBBSelectRegIn(NewSelectIn);
+ return NewSelectIn;
+ }
+}
+
+static void checkRegOnlyPHIInputs(MachineFunction &MF) {
+ for (auto &MBBI : MF) {
+ for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(),
+ E = MBBI.instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ assert(Instr.getOperand(i * 2 + 1).isReg() &&
+ "PHI Operand not a register");
+ }
+ }
+ }
+ }
+}
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
+INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+
+char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
+
+
+bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &(MF.getRegInfo());
+ initFallthroughMap(MF);
+
+ checkRegOnlyPHIInputs(MF);
+ DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ DEBUG(MF.dump());
+
+ Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
+ DEBUG(Regions->dump());
+
+ RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
+ setRegionMRT(RTree);
+ initializeSelectRegisters(RTree, 0, MRI, TII);
+ DEBUG(RTree->dump(TRI));
+ bool result = structurizeRegions(RTree, true);
+ delete RTree;
+ DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ initFallthroughMap(MF);
+ return result;
+}
+
+FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
+ return new AMDGPUMachineCFGStructurizer();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 36dcc699d4ea..e40f39557747 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
- case Instruction::Load:
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(Inst);
+ return !LI->isVolatile();
+ }
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value.
StoreInst *SI = cast<StoreInst>(Inst);
- return SI->getPointerOperand() == User;
+ return (SI->getPointerOperand() == User) && !SI->isVolatile();
}
default:
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 972c28579f7a..6e301b4ad527 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWA(false),
HasDPP(false),
FlatAddressSpace(false),
+ FlatInstOffsets(false),
+ FlatGlobalInsts(false),
+ FlatScratchInsts(false),
R600ALUInst(false),
CaymanISA(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index a5cda817ac11..bed7d326b3dd 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -145,6 +145,9 @@ protected:
bool HasSDWA;
bool HasDPP;
bool FlatAddressSpace;
+ bool FlatInstOffsets;
+ bool FlatGlobalInsts;
+ bool FlatScratchInsts;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -380,6 +383,18 @@ public:
return FlatAddressSpace;
}
+ bool hasFlatInstOffsets() const {
+ return FlatInstOffsets;
+ }
+
+ bool hasFlatGlobalInsts() const {
+ return FlatGlobalInsts;
+ }
+
+ bool hasFlatScratchInsts() const {
+ return FlatScratchInsts;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index cd5bad04d0b3..386a88b0520f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -118,6 +118,13 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass(
cl::desc("Use new waitcnt insertion pass"),
cl::init(false));
+// Option to run late CFG structurizer
+static cl::opt<bool> LateCFGStructurize(
+ "amdgpu-late-structurize",
+ cl::desc("Enable late CFG structurization"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() {
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (!LateCFGStructurize) {
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ }
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createAMDGPUAnnotateUniformValues());
- addPass(createSIAnnotateControlFlowPass());
+ if (!LateCFGStructurize) {
+ addPass(createSIAnnotateControlFlowPass());
+ }
return false;
}
@@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
+ if (LateCFGStructurize) {
+ addPass(createAMDGPUMachineCFGStructurizerPass());
+ }
addPass(createSIWholeQuadModePass());
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c9482c37ec80..beafebc1284a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
- case Instruction::InsertElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
+ return 0;
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
// Extracts are just reads of a subregister, so are free. Inserts are
// considered free because we don't want to have any cost for scalarizing
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
+ }
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
@@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
+
+unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (ST->hasVOP3PInsts()) {
+ VectorType *VT = cast<VectorType>(Tp);
+ if (VT->getNumElements() == 2 &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle is free.
+
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc:
+ return 0;
+ default:
+ break;
+ }
+ }
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 71d6306bc1a5..e0024e21e82b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,9 @@ public:
}
unsigned getVectorSplitCost() { return 0; }
+
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 7c0ef4aeac3c..cafce0164fa9 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUISelDAGToDAG.cpp
AMDGPULowerIntrinsics.cpp
AMDGPUMCInstLower.cpp
+ AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index b0ac0e689a0b..8ba9efd42c70 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+def FLATAtomic : ComplexPattern<i64, 2, "SelectFlat">;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -62,7 +62,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<8> vdst;
bits<1> slc;
bits<1> glc;
- bits<1> tfe;
+
+ // We don't use tfe right now, and it was removed in gfx9.
+ bits<1> tfe = 0;
// 15-0 is reserved.
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
@@ -79,8 +81,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, GLC:$glc, slc:$slc),
+ " $vdst, $vaddr$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
}
@@ -88,8 +90,8 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
opName,
(outs),
- (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc),
+ " $vaddr, $vdata$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo<
def "" : FLAT_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vaddr, $vdata$slc",
[]>,
AtomicNoRet <NAME, 0> {
let mayLoad = 1;
@@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr, $vdata glc$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vdst, $vaddr, $vdata glc$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>,
AtomicNoRet <NAME, 1> {
let mayLoad = 1;
let mayStore = 1;
@@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st <truncstorei16>;
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
+ (inst $addr, 0, 0)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 1, 0, 0)
+ (inst $addr, 1, 0)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0, 0)
+ (inst $addr, $data, 0, 0)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
// atomic store follows atomic binop convention so the address comes
// first.
(node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0, 0)
+ (inst $addr, $data, 1, 0)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
(vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0, 0)
+ (inst $addr, $data, 0)
>;
let Predicates = [isCIVI] in {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index bf16a8216001..8066428fe44a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI,
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
const auto OtherSGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST,
return VW < OtherVW;
}
}
- return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
- (getVGRPNum() < O.getVGRPNum());
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
+ (getVGPRNum() < O.getVGPRNum());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
- OS << "VGPRs: " << getVGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
- OS << ", SGPRs: " << getSGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+ OS << "VGPRs: " << getVGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ OS << ", SGPRs: " << getSGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
OS << ", LVGPR WT: " << getVGPRTuplesWeight()
<< ", LSGPR WT: " << getSGPRTuplesWeight();
if (ST) OS << " -> Occ: " << getOccupancy(*ST);
@@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
- assert(!MRI.reg_nodbg_empty(Reg));
LaneBitmask LiveMask;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
if (LiveMask.any())
@@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
- LiveRegs = getLiveRegsAfter(MI, LIS);
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
-}
-
-LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
assert(MO.isDef() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
-LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
assert(MO.isUse() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
@@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(MaxPressure, CurPressure);
}
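+// Reset the tracker to the point just before MI. Returns false when the
+// block contains no further non-debug instructions.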
+bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LastTrackedMI = nullptr;
+ MBBEnd = MI.getParent()->end();
+ NextMI = &MI;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsBefore(*NextMI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ return true;
+}
+
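+// Advance the tracked state to the point just before NextMI, removing
+// registers and lane masks that are no longer live there.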
+bool GCNDownwardRPTracker::advanceBeforeNext() {
+ assert(MRI && "call reset first");
+
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+
+ SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex();
+ assert(SI.isValid());
+
+ // Remove dead registers or mask bits.
+ for (auto &It : LiveRegs) {
+ const LiveInterval &LI = LIS.getInterval(It.first);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second &= ~S.LaneMask;
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ }
+ } else if (!LI.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second = LaneBitmask::getNone();
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ if (It.second.none())
+ LiveRegs.erase(It.first);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+
+ return true;
+}
+
+void GCNDownwardRPTracker::advanceToNext() {
+ LastTrackedMI = &*NextMI++;
+
+ // Add new registers or mask bits.
+ for (const auto &MO : LastTrackedMI->defs()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getDefRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+bool GCNDownwardRPTracker::advance() {
+  // If we have just called reset, the live set is already up to date.
+ if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext()))
+ return false;
+ advanceToNext();
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) {
+ while (NextMI != End)
+ if (!advance()) return false;
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy) {
+ reset(*Begin, LiveRegsCopy);
+ return advance(End);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
@@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const {
return true;
}
+void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto It = LiveRegs.find(Reg);
+ if (It != LiveRegs.end() && It->second.any())
+ OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':'
+ << PrintLaneMask(It->second);
+ }
+ OS << '\n';
+}
#endif
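
For reference, the upward tracker keeps its existing bottom-up protocol; a minimal sketch of walking one block with it (illustrative only, assumes MBB is non-empty and LIS is in scope):

  GCNUpwardRPTracker RPT(LIS);
  RPT.reset(MBB.back());            // live set just below the last instruction
  for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
    RPT.recede(*I);                 // move to the state just above *I
  GCNRegPressure MaxRP = RPT.moveMaxPressure();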
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 82e76a7bfddc..9875ca6a6d16 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -33,19 +33,19 @@ struct GCNRegPressure {
clear();
}
- bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
- unsigned getSGRPNum() const { return Value[SGPR32]; }
- unsigned getVGRPNum() const { return Value[VGPR32]; }
+ unsigned getSGPRNum() const { return Value[SGPR32]; }
+ unsigned getVGPRNum() const { return Value[VGPR32]; }
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
unsigned getOccupancy(const SISubtarget &ST) const {
- return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
void inc(unsigned Reg,
@@ -92,16 +92,21 @@ public:
typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
protected:
+ const LiveIntervals &LIS;
LiveRegSet LiveRegs;
GCNRegPressure CurPressure, MaxPressure;
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
- GCNRPTracker() {}
+ GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+ LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+ void clearMaxPressure() { MaxPressure.clear(); }
+
// returns MaxPressure, resetting it
decltype(MaxPressure) moveMaxPressure() {
auto Res = MaxPressure;
@@ -111,17 +116,16 @@ public:
decltype(LiveRegs) moveLiveRegs() {
return std::move(LiveRegs);
}
+ static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI);
};
class GCNUpwardRPTracker : public GCNRPTracker {
- const LiveIntervals &LIS;
- LaneBitmask getDefRegMask(const MachineOperand &MO) const;
- LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
- GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
// Reset tracker to the point just below MI,
// filling the live regs at this point using LIS.
- void reset(const MachineInstr &MI);
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
// move to the state just above the MI
void recede(const MachineInstr &MI);
@@ -131,6 +135,41 @@ public:
bool isValid() const;
};
+class GCNDownwardRPTracker : public GCNRPTracker {
+ // Next instruction to process; set by reset and advanceBeforeNext.
+ MachineBasicBlock::const_iterator NextMI;
+
+ MachineBasicBlock::const_iterator MBBEnd;
+
+public:
+ GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+
+ MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+
+ // Reset tracker to the point before MI, filling the live regs at this
+ // point using LIS. Returns false if the block contains nothing but
+ // debug values.
+ bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+
+ // Move to the state just before the next MI. Returns false if the end
+ // of the block is reached.
+ bool advanceBeforeNext();
+
+ // Move to the state at MI; advanceBeforeNext has to be called first.
+ void advanceToNext();
+
+ // Move to the state at the next MI. Returns false if the end of the
+ // block is reached.
+ bool advance();
+
+ // Advance instructions until just before End.
+ bool advance(MachineBasicBlock::const_iterator End);
+
+ // Reset to Begin and advance to End.
+ bool advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy = nullptr);
+};
+
LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
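
A usage sketch for the new downward tracker (computeRangePressure is a hypothetical helper, but the calls mirror the API declared above; getRealRegPressure() in GCNSchedStrategy.cpp does essentially this):

  static GCNRegPressure
  computeRangePressure(const LiveIntervals &LIS,
                       MachineBasicBlock::const_iterator Begin,
                       MachineBasicBlock::const_iterator End) {
    GCNDownwardRPTracker RPT(LIS);
    if (!RPT.reset(*Begin))          // false: nothing but debug values
      return GCNRegPressure();
    RPT.advance(End);                // walk [Begin, End), tracking live lanes
    return RPT.moveMaxPressure();    // read and reset the running maximum
  }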
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 630442625aa3..8ec46665daf5 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
*MF.getFunction())),
- MinOccupancy(StartingOccupancy), Stage(0) {
+ MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
+ if (Stage == 0) {
+ // Just record regions on the first pass.
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ return;
+ }
+
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this)
Unsched.push_back(&I);
- std::pair<unsigned, unsigned> PressureBefore;
+ GCNRegPressure PressureBefore;
if (LIS) {
- DEBUG(dbgs() << "Pressure before scheduling:\n");
- discoverLiveIns();
- PressureBefore = getRealRegPressure();
+ PressureBefore = Pressure[RegionIdx];
+
+ DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
- if (Stage == 0)
- Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
if (!LIS)
return;
// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "Pressure after scheduling:\n");
auto PressureAfter = getRealRegPressure();
- LiveIns.clear();
- if (PressureAfter.first <= S.SGPRCriticalLimit &&
- PressureAfter.second <= S.VGPRCriticalLimit) {
+ DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+
+ if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+ PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ Pressure[RegionIdx] = PressureAfter;
DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.first,
- PressureAfter.second, MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.first,
- PressureBefore.second, MF);
+ unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
+ PressureAfter.getVGPRNum(), MF);
+ unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
+ PressureBefore.getVGPRNum(), MF);
DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
", after " << WavesAfter << ".\n");
@@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore)
+ if (WavesAfter >= WavesBefore) {
+ Pressure[RegionIdx] = PressureAfter;
return;
+ }
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
@@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() {
DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
- if (Stage == 0)
- Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
placeDebugValues();
}
-static inline void setMask(const MachineRegisterInfo &MRI,
- const SIRegisterInfo *SRI, unsigned Reg,
- LaneBitmask &PrevMask, LaneBitmask NewMask,
- unsigned &SGPRs, unsigned &VGPRs) {
- int NewRegs = countPopulation(NewMask.getAsInteger()) -
- countPopulation(PrevMask.getAsInteger());
- if (SRI->isSGPRReg(MRI, Reg))
- SGPRs += NewRegs;
- if (SRI->isVGPR(MRI, Reg))
- VGPRs += NewRegs;
- assert ((int)SGPRs >= 0 && (int)VGPRs >= 0);
- PrevMask = NewMask;
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+ GCNDownwardRPTracker RPTracker(*LIS);
+ RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ return RPTracker.moveMaxPressure();
}
-void GCNScheduleDAGMILive::discoverLiveIns() {
- unsigned SGPRs = 0;
- unsigned VGPRs = 0;
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+ GCNDownwardRPTracker RPTracker(*LIS);
+
+ // If the block has only one successor then the live-ins of that successor
+ // are the live-outs of the current block. We can reuse the calculated live
+ // set if the successor will be sent to scheduling after the current block.
+ const MachineBasicBlock *OnlySucc = nullptr;
+ if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+ OnlySucc = *MBB->succ_begin();
+ }
- auto &MI = *begin()->getParent()->getFirstNonDebugInstr();
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
-
- DEBUG(dbgs() << "Region live-ins:");
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
- continue;
- const LiveInterval &LI = LIS->getInterval(Reg);
- LaneBitmask LaneMask = LaneBitmask::getNone();
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (S.liveAt(SI))
- LaneMask |= S.LaneMask;
- } else if (LI.liveAt(SI)) {
- LaneMask = MRI.getMaxLaneMaskForVReg(Reg);
- }
+ // Scheduler sends regions from the end of the block upwards.
+ size_t CurRegion = RegionIdx;
+ for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
+ if (Regions[CurRegion].first->getParent() != MBB)
+ break;
+ --CurRegion;
+
+ auto I = MBB->begin();
+ auto LiveInIt = MBBLiveIns.find(MBB);
+ if (LiveInIt != MBBLiveIns.end()) {
+ auto LiveIn = std::move(LiveInIt->second);
+ RPTracker.reset(*MBB->begin(), &LiveIn);
+ MBBLiveIns.erase(LiveInIt);
+ } else {
+ I = Regions[CurRegion].first;
+ RPTracker.reset(*I);
+ }
- if (LaneMask.any()) {
- setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs);
+ for (;;) {
+ I = RPTracker.getNext();
- DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
- << PrintLaneMask(LiveIns[Reg]));
+ if (Regions[CurRegion].first == I) {
+ LiveIns[CurRegion] = RPTracker.getLiveRegs();
+ RPTracker.clearMaxPressure();
}
- }
- LiveInPressure = std::make_pair(SGPRs, VGPRs);
+ if (Regions[CurRegion].second == I) {
+ Pressure[CurRegion] = RPTracker.moveMaxPressure();
+ if (CurRegion-- == RegionIdx)
+ break;
+ }
+ RPTracker.advanceToNext();
+ RPTracker.advanceBeforeNext();
+ }
- DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs
- << "\nVGPR = " << VGPRs << '\n');
+ if (OnlySucc) {
+ if (I != MBB->end()) {
+ RPTracker.advanceToNext();
+ RPTracker.advance(MBB->end());
+ }
+ RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
+ RPTracker.advanceBeforeNext();
+ MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
+ }
}
-std::pair<unsigned, unsigned>
-GCNScheduleDAGMILive::getRealRegPressure() const {
- unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs;
- SGPRs = MaxSGPRs = LiveInPressure.first;
- VGPRs = MaxVGPRs = LiveInPressure.second;
-
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns);
+void GCNScheduleDAGMILive::finalizeSchedule() {
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- for (const MachineInstr &MI : *this) {
- if (MI.isDebugValue())
- continue;
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
+ LiveIns.resize(Regions.size());
+ Pressure.resize(Regions.size());
- // Remove dead registers or mask bits.
- for (auto &It : LiveRegs) {
- if (It.second.none())
- continue;
- const LiveInterval &LI = LIS->getInterval(It.first);
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (!S.liveAt(SI))
- setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask,
- SGPRs, VGPRs);
- } else if (!LI.liveAt(SI)) {
- setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(),
- SGPRs, VGPRs);
- }
- }
+ do {
+ Stage++;
+ RegionIdx = 0;
+ MachineBasicBlock *MBB = nullptr;
- // Add new registers or mask bits.
- for (const auto &MO : MI.defs()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
- unsigned SubRegIdx = MO.getSubReg();
- LaneBitmask LaneMask = SubRegIdx != 0
- ? TRI->getSubRegIndexLaneMask(SubRegIdx)
- : MRI.getMaxLaneMaskForVReg(Reg);
- LaneBitmask &LM = LiveRegs[Reg];
- setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs);
- }
- MaxSGPRs = std::max(MaxSGPRs, SGPRs);
- MaxVGPRs = std::max(MaxVGPRs, VGPRs);
- }
+ if (Stage > 1) {
+ // Retry function scheduling if the resulting occupancy is lower than the
+ // one used for the first-pass scheduling. This gives more freedom to
+ // schedule low register pressure blocks.
+ // Code is partially copied from MachineSchedulerBase::scheduleRegions().
- DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs
- << "\nVGPR = " << MaxVGPRs << '\n');
+ if (!LIS || StartingOccupancy <= MinOccupancy)
+ break;
- return std::make_pair(MaxSGPRs, MaxVGPRs);
-}
+ DEBUG(dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
-void GCNScheduleDAGMILive::finalizeSchedule() {
- // Retry function scheduling if we found resulting occupancy and it is
- // lower than used for first pass scheduling. This will give more freedom
- // to schedule low register pressure blocks.
- // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+ S.setTargetOccupancy(MinOccupancy);
+ }
- if (!LIS || StartingOccupancy <= MinOccupancy)
- return;
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
- DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ if (RegionBegin->getParent() != MBB) {
+ if (MBB) finishBlock();
+ MBB = RegionBegin->getParent();
+ startBlock(MBB);
+ if (Stage == 1)
+ computeBlockPressure(MBB);
+ }
- Stage++;
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- S.setTargetOccupancy(MinOccupancy);
+ unsigned NumRegionInstrs = std::distance(begin(), end());
+ enterRegion(MBB, begin(), end(), NumRegionInstrs);
- MachineBasicBlock *MBB = nullptr;
- for (auto Region : Regions) {
- RegionBegin = Region.first;
- RegionEnd = Region.second;
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (begin() == end() || begin() == std::prev(end())) {
+ exitRegion();
+ continue;
+ }
- if (RegionBegin->getParent() != MBB) {
- if (MBB) finishBlock();
- MBB = RegionBegin->getParent();
- startBlock(MBB);
- }
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF.getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *begin() << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
- unsigned NumRegionInstrs = std::distance(begin(), end());
- enterRegion(MBB, begin(), end(), NumRegionInstrs);
+ schedule();
- // Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (begin() == end() || begin() == std::prev(end())) {
exitRegion();
- continue;
+ ++RegionIdx;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName()
- << ":BB#" << MBB->getNumber() << " " << MBB->getName()
- << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ finishBlock();
- schedule();
-
- exitRegion();
- }
- finishBlock();
- LiveIns.shrink_and_clear();
+ } while (Stage < 2);
}
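
Condensed, finalizeSchedule() above amounts to the following two-stage loop (a sketch with simplified control flow; scheduleRegion is a hypothetical stand-in for the enterRegion/schedule/exitRegion sequence):

  for (Stage = 1; Stage <= 2; ++Stage) {
    if (Stage == 2) {
      // A second pass pays off only if the first one found an occupancy
      // below the starting one; reschedule targeting that occupancy.
      if (!LIS || StartingOccupancy <= MinOccupancy)
        break;
      S.setTargetOccupancy(MinOccupancy);
    }
    for (RegionIdx = 0; RegionIdx < Regions.size(); ++RegionIdx)
      scheduleRegion(Regions[RegionIdx]);  // records pressure per region
  }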
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15af232704ff..3ed3cd5b3b1c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Scheduling stage number.
unsigned Stage;
+ // Current region index.
+ size_t RegionIdx;
+
// Vector of regions recorded for later rescheduling.
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
- // Region live-ins.
- DenseMap<unsigned, LaneBitmask> LiveIns;
+ // Region live-in cache.
+ SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+ // Region pressure cache.
+ SmallVector<GCNRegPressure, 32> Pressure;
+
+ // Temporary basic block live-in cache.
+ DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
- // Number of live-ins to the current region, first SGPR then VGPR.
- std::pair<unsigned, unsigned> LiveInPressure;
+ // Return current region pressure.
+ GCNRegPressure getRealRegPressure() const;
- // Collect current region live-ins.
- void discoverLiveIns();
+ // Compute and cache live-ins and pressure for all regions in block.
+ void computeBlockPressure(const MachineBasicBlock *MBB);
- // Return current region pressure. First value is SGPR number, second is VGPR.
- std::pair<unsigned, unsigned> getRealRegPressure() const;
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d8cb98fe1b19..8cb35c506135 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+ ReturnStruct = StructType::get(Boolean, Int64);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index cc93c27731ff..48a14e4dbea2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
break;
}
assert(Found);
+ (void)Found;
// This should be before all vector instructions.
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
@@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performExtractVectorEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDValue Vec = N->getOperand(0);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx);
+ return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ }
+
+ return SDValue();
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
break;
}
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVectorEltCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
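
The combine implements the rewrite below; it is profitable because allUsesHaveSourceMods(N) guarantees every user of the extracted element can fold the fneg into a source modifier, so the negation becomes free:

  (extract_vector_elt (fneg v), i)  -->  (fneg (extract_vector_elt v, i))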
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index d177777ad5ee..046e677756d1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92e452a3d6a0..065fd09eb356 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ int64_t Value) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
+ if (RegClass == &AMDGPU::SReg_32RegClass ||
+ RegClass == &AMDGPU::SGPR_32RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::SReg_64RegClass ||
+ RegClass == &AMDGPU::SGPR_64RegClass ||
+ RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::VGPR_32RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(Value);
+ return;
+ }
+ if (RegClass == &AMDGPU::VReg_64RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ unsigned EltSize = 4;
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RegClass)) {
+ if (RI.getRegSizeInBits(*RegClass) > 32) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
+ }
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ int64_t IdxValue = Idx == 0 ? Value : 0;
+
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, Idx));
+ Builder.addImm(IdxValue);
+ }
+}
+
+const TargetRegisterClass *
+SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
+ return &AMDGPU::VGPR_32RegClass;
+}
+
+void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg,
+ unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
+ "Not a VGPR32 reg");
+
+ if (Cond.size() == 1) {
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(Cond[0]);
+ } else if (Cond.size() == 2) {
+ assert(Cond[0].isImm() && "Cond[0] is not an immediate");
+ switch (Cond[0].getImm()) {
+ case SIInstrInfo::SCC_TRUE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::SCC_FALSE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::VCCNZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::VCCZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::EXECNZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::EXECZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ llvm_unreachable("Unhandled branch predicate EXECZ");
+ break;
+ }
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+ } else {
+ llvm_unreachable("Can only handle Cond size 1 or 2");
+ }
+}
+
+unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
+unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.getRegSizeInBits(*DstRC) == 32) {
@@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
insertWaitStates(MBB, MI, 1);
}
+void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
+ auto MF = MBB.getParent();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ assert(Info->isEntryFunction());
+
+ if (MBB.succ_empty()) {
+ bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
+ if (HasNoTerminator)
+ BuildMI(MBB, MBB.end(), DebugLoc(),
+ get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+}
+
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
return false;
}
- BranchPredicate Pred = getBranchPredicate(I->getOpcode());
- if (Pred == INVALID_BR)
- return true;
+ MachineBasicBlock *CondBB = nullptr;
- MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
- Cond.push_back(MachineOperand::CreateImm(Pred));
- Cond.push_back(I->getOperand(1)); // Save the branch register.
+ if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ CondBB = I->getOperand(1).getMBB();
+ Cond.push_back(I->getOperand(0));
+ } else {
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
+ return true;
+ CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
+ }
++I;
if (I == MBB.end()) {
@@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
return 1;
}
+ if (Cond.size() == 1 && Cond[0].isReg()) {
+ BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
+ .add(Cond[0])
+ .addMBB(TBB);
+ return 1;
+ }
+
assert(TBB && Cond[0].isImm());
unsigned Opcode
@@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 2);
- Cond[0].setImm(-Cond[0].getImm());
- return false;
+ if (Cond.size() != 2) {
+ return true;
+ }
+
+ if (Cond[0].isImm()) {
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
+ }
+
+ return true;
}
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
@@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return false;
}
+bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
+ return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
+}
+
+void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const {
+ MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
+ assert(TI != IfEntry->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = IfEntry->getParent();
+ MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *SIIF =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
+ .add(Branch->getOperand(0))
+ .add(Branch->getOperand(1));
+ MachineInstr *SIEND =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
+ .addReg(DstReg);
+
+ IfEntry->erase(TI);
+ IfEntry->insert(IfEntry->end(), SIIF);
+ IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
+ }
+}
+
+void SIInstrInfo::convertNonUniformLoopRegion(
+ MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
+ MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
+ // We expect 2 terminators, one conditional and one unconditional.
+ assert(TI != LoopEnd->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = LoopEnd->getParent();
+ MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstrBuilder HeaderPHIBuilder =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
+ for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
+ E = LoopEntry->pred_end();
+ PI != E; ++PI) {
+ if (*PI == LoopEnd) {
+ HeaderPHIBuilder.addReg(BackEdgeReg);
+ } else {
+ MachineBasicBlock *PMBB = *PI;
+ unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
+ ZeroReg, 0);
+ HeaderPHIBuilder.addReg(ZeroReg);
+ }
+ HeaderPHIBuilder.addMBB(*PI);
+ }
+ MachineInstr *HeaderPhi = HeaderPHIBuilder;
+ MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
+ get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
+ .addReg(DstReg)
+ .add(Branch->getOperand(0));
+ MachineInstr *SILOOP =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
+ .addReg(BackEdgeReg)
+ .addMBB(LoopEntry);
+
+ LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
+ LoopEnd->erase(TI);
+ LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
+ LoopEnd->insert(LoopEnd->end(), SILOOP);
+ }
+}
+
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
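
A sketch of the new materializeImmediate() from a caller's point of view (hypothetical call site; the register class is chosen to show the sub-register split):

  // No single-instruction move exists for a 128-bit SGPR tuple, so
  // materializeImmediate() emits one S_MOV_B64 per 64-bit sub-register,
  // placing Value in the first part and zero in the rest, i.e. the
  // immediate zero-extended across the tuple.
  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
  TII->materializeImmediate(MBB, MBB.getFirstTerminator(), DebugLoc(), Reg, 1);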
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 03a5ef74b179..f6e5e8883f63 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -143,6 +143,23 @@ public:
RegScavenger *RS, unsigned TmpReg,
unsigned Offset, unsigned Size) const;
+ void materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL,
+ unsigned DestReg,
+ int64_t Value) const;
+
+ const TargetRegisterClass *getPreferredSelectRegClass(
+ unsigned Size) const;
+
+ unsigned insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
+ unsigned insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -193,7 +210,7 @@ public:
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const override;
+ bool AllowModify = false) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
@@ -218,6 +235,11 @@ public:
unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const override;
+ void insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -705,6 +727,7 @@ public:
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ void insertReturn(MachineBasicBlock &MBB) const;
/// \brief Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
@@ -750,6 +773,14 @@ public:
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+ bool isNonUniformBranchInstr(MachineInstr &Instr) const;
+
+ void convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const;
+
+ void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
+ MachineBasicBlock *LoopEnd) const;
+
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 7ccb54f54e34..3b4bdc864253 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI <
let isTerminator = 1 in {
+ def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
+ (outs),
+ (ins SReg_64:$vcc, brtarget:$target),
+ [(brcond i1:$vcc, bb:$target)]> {
+ let Size = 12;
+}
+
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
[(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 2281f338ab45..4a11d9471f1d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -164,8 +164,11 @@ multiclass VOP2eInst <string opName,
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
- field string Asm32 = "$vdst, $src0, $src1, $imm";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
- field string Asm32 = "$vdst, $src0, $imm, $src1";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
@@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let SubtargetPredicate = isGCN in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2",
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
let SubtargetPredicate = isVI in {
-def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
@@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
-def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
@@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> {
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP2_Real_e64only_vi <bits<10> op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Hack to stop printing _e64
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ let OutOperandList = (outs VGPR_32:$vdst);
+ let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ }
+}
+
multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
@@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
-defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
-defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
-defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>;
defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 217a07488853..ffa6c60d6b1f 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
let SubtargetPredicate = isCIVI in {
-def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
@@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci<bits<9> op> {
}
}
-defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
@@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi<bits<10> op> {
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 28c407f74125..dd7fe871345a 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -404,21 +404,11 @@ public:
/// Returns predicate register associated with the given frame instruction.
unsigned getFramePred(const MachineInstr &MI) const {
assert(isFrameInstr(MI));
- if (isFrameSetup(MI))
- // Operands of ADJCALLSTACKDOWN:
- // - argument declared in ADJCALLSTACKDOWN pattern:
- // 0 - frame size
- // 1 - predicate code (like ARMCC::AL)
- // - added by predOps:
- // 2 - predicate reg
- return MI.getOperand(2).getReg();
- assert(MI.getOpcode() == ARM::ADJCALLSTACKUP ||
- MI.getOpcode() == ARM::tADJCALLSTACKUP);
- // Operands of ADJCALLSTACKUP:
- // - argument declared in ADJCALLSTACKUP pattern:
+ // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP:
+ // - argument declared in the pattern:
// 0 - frame size
- // 1 - arg of CALLSEQ_END
- // 2 - predicate code
+ // 1 - arg of CALLSEQ_START/CALLSEQ_END
+ // 2 - predicate code (like ARMCC::AL)
// - added by predOps:
// 3 - predicate reg
return MI.getOperand(3).getReg();
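
With the layouts unified, a frame-setup pseudo can be built and queried the same way as a frame-destroy one; a sketch (NumBytes, MBB, I and DL assumed in scope):

  MachineInstr *FrameSetup =
      BuildMI(MBB, I, DL, TII.get(ARM::ADJCALLSTACKDOWN))
          .addImm(NumBytes)             // 0 - frame size
          .addImm(0)                    // 1 - arg of CALLSEQ_START
          .add(predOps(ARMCC::AL));     // 2, 3 - predicate code and register
  unsigned PredReg = TII.getFramePred(*FrameSetup);  // reads operand 3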
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 9178c67afa6e..46ac4d0ad933 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// We now know the size of the stack - update the ADJCALLSTACKDOWN
// accordingly.
- CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL));
+ CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL));
MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
.addImm(ArgHandler.StackSize)
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 56cac855620d..4f6a73b5980d 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackDown))
- .addImm(NumBytes));
+ .addImm(NumBytes).addImm(0));
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e64582402fe1..f8b584db7b99 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->isTargetWatchOS() ||
- (Subtarget->isTargetIOS() &&
- !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+ if (Subtarget->isTargetMachO() &&
+ !(Subtarget->isTargetIOS() &&
+ Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
@@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!isSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
@@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
- Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ Type *RetTy = StructType::get(ArgTy, ArgTy);
auto &DL = DAG.getDataLayout();
ArgListTy Args;
@@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = StructType::get(Ty, Ty);
if (Subtarget->isTargetWindows())
InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
@@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
case AtomicOrdering::SequentiallyConsistent:
- if (!IsStore)
+ if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
/*FALLTHROUGH*/
case AtomicOrdering::Release:
@@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
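
On the caller side the new hook signature lets the target inspect the instruction itself rather than IsStore/IsLoad flags; a simplified sketch of how AtomicExpand-style code would drive it (TLI and SI assumed in scope):

  IRBuilder<> Builder(SI);                   // SI: an atomic store to expand
  // For SequentiallyConsistent, ARM now emits the leading DMB only when
  // SI->hasAtomicStore() holds.
  TLI->emitLeadingFence(Builder, SI, SI->getOrdering());
  // ... lower SI to a monotonic access here ...
  TLI->emitTrailingFence(Builder, SI, SI->getOrdering());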
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 08c51b66dfe7..875c06210ae6 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -483,10 +483,10 @@ class InstrItineraryData;
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index a94d6048f02d..d06b7d0896f1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -16,7 +16,8 @@
//
// Type profiles.
-def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
@@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
- [(ARMcallseq_start timm:$amt)]>;
+PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
+ [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
}
def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
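
The SelectionDAG side of the same change: CALLSEQ_START now carries two immediates, and the lowering code switches to the convenience overload (sketch; the second immediate is 0 at every call site touched here):

  // Before: Chain = DAG.getCALLSEQ_START(Chain,
  //             DAG.getIntPtrConstant(NumBytes, dl, true), dl);
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);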
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 8048c758e998..bee83dfb6f63 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -284,8 +284,8 @@ def tADJCALLSTACKUP :
Requires<[IsThumb, IsThumb1Only]>;
def tADJCALLSTACKDOWN :
- PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
- [(ARMcallseq_start imm:$amt)]>,
+ PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
+ [(ARMcallseq_start imm:$amt, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
}
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 2ac3fda9f448..8c680cdf9b47 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
assert(RegBank && "Can't get reg bank for virtual register");
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- (void)DstSize;
- unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
- (void)SrcSize;
- // We use copies for trunc, so it's ok for the size of the destination to be
- // smaller (the higher bits will just be undefined).
- assert(DstSize <= SrcSize && "Copy with different width?!");
-
assert((RegBank->getID() == ARM::GPRRegBankID ||
RegBank->getID() == ARM::FPRRegBankID) &&
"Unsupported reg bank");
@@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
-static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
- MachineRegisterInfo &MRI) {
- assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp");
-
- LLT Ty = MRI.getType(MIB->getOperand(0).getReg());
- unsigned ValSize = Ty.getSizeInBits();
-
- if (ValSize == 32) {
- if (TII.getSubtarget().useNEONForSinglePrecisionFP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDS));
- } else {
- assert(ValSize == 64 && "Unsupported size for floating point value");
- if (TII.getSubtarget().isFPOnlySP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDD));
- }
- MIB.add(predOps(ARMCC::AL));
-
- return true;
-}
-
static bool selectSequence(MachineInstrBuilder &MIB,
const ARMBaseInstrInfo &TII,
MachineRegisterInfo &MRI,
@@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
break;
}
+ case G_ANYEXT:
case G_TRUNC: {
// The high bits are undefined, so there's nothing special to do, just
// treat it as a copy.
@@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs() << "G_TRUNC operands on different register banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
return false;
}
@@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
- case G_FADD:
- if (!selectFAdd(MIB, TII, MRI))
- return false;
- break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 9b86030fdd29..5bf6c7aed6b8 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, 1, p0}, Legal);
}
- for (unsigned Op : {G_ADD, G_SUB, G_MUL})
- for (auto Ty : {s1, s8, s16, s32})
- setAction({Op, Ty}, Legal);
+ for (unsigned Op : {G_ADD, G_SUB, G_MUL}) {
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ setAction({Op, s32}, Legal);
+ }
for (unsigned Op : {G_SDIV, G_UDIV}) {
for (auto Ty : {s8, s16})
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 581d5fe159fd..7e4d598a6e0b 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
}
}
}
+ bool Changed = false;
// Remove the tagged DMB
for (auto MI : ToRemove) {
MI->eraseFromParent();
++NumDMBsRemoved;
+ Changed = true;
}
- return NumDMBsRemoved > 0;
+ return Changed;
}
/// createARMOptimizeBarriersPass - Returns an instance of the remove double
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 13a32211f88c..a20997c95cd9 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_UDIV:
case G_SEXT:
case G_ZEXT:
+ case G_ANYEXT:
case G_TRUNC:
case G_GEP:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index d09f3ecbaa28..5583d6148b08 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -13,7 +13,9 @@
#include "ARM.h"
#include "ARMCallLowering.h"
#include "ARMLegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "ARMRegisterBankInfo.h"
+#endif
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp
index c297865db820..0ec8e8b08ceb 100644
--- a/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/lib/Target/AVR/AVRFrameLowering.cpp
@@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
DebugLoc DL = MI->getDebugLoc();
unsigned int Opcode = MI->getOpcode();
- int Amount = MI->getOperand(0).getImm();
+ int Amount = TII.getFrameSize(*MI);
// Adjcallstackup does not need to allocate stack space for the call, instead
// we insert push instructions that will allocate the necessary stack.
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index f0ab6acedad1..ef9c00e4b784 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = (Type *)StructType::get(Ty, Ty);
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator I = MBB->getParent()->begin();
- ++I;
+ MachineFunction::iterator I;
+ for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
+ if (I != MF->end()) ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
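The AVR custom-inserter change replaces a hard-coded insertion point (always right after the entry block) with a scan that places the new true/false blocks immediately after the block currently being expanded, which keeps the block layout sensible when the pseudo is not in the first block. A small standalone illustration of the iterator scan, with std::list standing in for the machine function's block list:

    #include <cassert>
    #include <list>

    int main() {
      std::list<int> Blocks = {1, 2, 3};        // 2 plays the role of MBB
      auto I = Blocks.begin();
      while (I != Blocks.end() && *I != 2) ++I; // find the current block
      if (I != Blocks.end()) ++I;               // insertion point: just past it
      Blocks.insert(I, 20);                     // trueMBB (insert before I)
      Blocks.insert(I, 21);                     // falseMBB
      assert((Blocks == std::list<int>{1, 2, 20, 21, 3}));
    }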
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 1b6547ef7795..06ad2b3ffdf8 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -17,7 +17,7 @@ include "AVRInstrFormats.td"
// AVR Type Profiles
//===----------------------------------------------------------------------===//
-def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -333,9 +333,9 @@ let Defs = [SP, SREG],
Uses = [SP] in
{
def ADJCALLSTACKDOWN : Pseudo<(outs),
- (ins i16imm:$amt),
+ (ins i16imm:$amt, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(AVRcallseq_start timm:$amt)]>;
+ [(AVRcallseq_start timm:$amt, timm:$amt2)]>;
// R31R30 is used to update SP, since it is a scratch reg and this instruction
// is placed after the function call then R31R30 should be always free.
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 2813e24d2ac7..11a47bad78ba 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
- const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
// Reserve the intermediate result registers r1 and r2
// The result of instructions like 'mul' is always stored here.
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index b9b3dff95c0a..6897161c903c 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
auto PtrVT = getPointerTy(MF.getDataLayout());
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
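The same mechanical rewrite recurs for every target in this patch (AVR, BPF, Hexagon, Lanai, MSP430, Mips, NVPTX, PowerPC): CALLSEQ_START now carries two immediates, the byte count pushed for outgoing arguments and the byte count popped by the callee, and a convenience overload builds both target constants itself. A before/after sketch of the call-site shape, shown as comments since it only compiles inside an LLVM target:

    // Before: each call site built the single constant operand by hand.
    //   Chain = DAG.getCALLSEQ_START(
    //       Chain, DAG.getIntPtrConstant(NumBytes, DL, /*isTarget=*/true), DL);
    // After: both byte counts are passed as plain integers; the second
    // (bytes popped by the callee) is 0 at all of these call sites.
    //   Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);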
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index 93ee24371c4d..c6c0ff587c6b 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -16,7 +16,8 @@ include "BPFInstrFormats.td"
// Instruction Operands and Patterns
// These are target-independent nodes, but have target-specific formats.
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
@@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
// ADJCALLSTACKDOWN/UP pseudo insns
let Defs = [R11], Uses = [R11] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 861af94f1e38..1dffebe97f2d 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Glue;
if (!IsTailCall) {
- SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 5a5799dbe009..e4df7ff5c200 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
KnownBits Known(T->getBitWidth());
computeKnownBits(V, Known, DL);
- return Known.Zero.countLeadingOnes() >= IterCount;
+ return Known.countMinLeadingZeros() >= IterCount;
}
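The Hexagon change is behavior-preserving: a bit set in KnownBits::Zero means that bit of the value is known to be zero, so counting the leading ones of the Zero mask counts the high bits known zero, which is exactly what the new countMinLeadingZeros() name says. A standalone sketch of the equivalence (toy 8-bit mask, not the real APInt API):

    #include <cassert>
    #include <cstdint>

    // Count leading set bits of an 8-bit mask, as countLeadingOnes() would.
    static unsigned countLeadingOnes(uint8_t Mask) {
      unsigned N = 0;
      for (int Bit = 7; Bit >= 0 && ((Mask >> Bit) & 1); --Bit)
        ++N;
      return N;
    }

    int main() {
      uint8_t Zero = 0xE0;                 // top three bits known zero
      assert(countLeadingOnes(Zero) == 3); // == countMinLeadingZeros()
    }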
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 32503d111c24..81b5e10c1173 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>;
def: Pat<(i1 1), (PS_true)>;
// Pseudo instructions.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def: Pat<(callseq_start timm:$amt),
- (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_start timm:$amt, timm:$amt2),
+ (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>;
def: Pat<(callseq_end timm:$amt1, timm:$amt2),
(ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 8c2caea2d5c5..0f99dfe342b8 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "",
[(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>;
let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
".error \"should not emit\" ", []>;
let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index d156294a0b0c..0a9cac2565f2 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "LanaiISelLowering.h"
#include "Lanai.h"
#include "LanaiCondCode.h"
-#include "LanaiISelLowering.h"
#include "LanaiMachineFunctionInfo.h"
#include "LanaiSubtarget.h"
#include "LanaiTargetObjectFile.h"
@@ -38,10 +38,11 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo(
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(
- Chain,
- DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+
+void LanaiTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case LanaiISD::SETCC:
+ Known = KnownBits(BitWidth);
+ Known.Zero.setBits(1, BitWidth);
+ break;
+ case LanaiISD::SELECT_CC:
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ break;
+ }
+}
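The new Lanai hook encodes two facts: LanaiISD::SETCC materializes a boolean, so every bit above bit 0 is known zero, and a SELECT_CC result can only claim bits on which both candidate values agree, hence the intersection of the two operands' known-bit masks. A standalone sketch of the SELECT_CC intersection (toy 8-bit masks in place of KnownBits):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t ZeroA = 0xF0, OneA = 0x01; // known bits of the true value
      uint8_t ZeroB = 0xC0, OneB = 0x03; // known bits of the false value
      uint8_t Zero = ZeroA & ZeroB;      // known zero only if zero in both
      uint8_t One  = OneA & OneB;        // known one only if one in both
      assert(Zero == 0xC0 && One == 0x01);
    }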
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index c2fba4f9d167..49ad52a39771 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -106,6 +106,11 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool IsVarArg,
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
index 285fca11737d..776fee101dfe 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -22,7 +22,8 @@ include "LanaiInstrFormats.td"
// -------------------------------------------------- //
// These are target-independent nodes, but have target-specific formats.
-def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
@@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SP.
let Defs = [SP], Uses = [SP] in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(CallSeqStart timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(CallSeqEnd timm:$amt1, timm:$amt2)]>;
@@ -770,9 +771,6 @@ let Uses = [SR] in {
[(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
}
-// SCC's output is already 1-bit so and'ing with 1 is redundant.
-def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
-
// Select with hardware support
let Uses = [SR], isSelect = 1 in {
def SELECT : InstRR<0b111, (outs GPR:$Rd),
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index f1cb0b6c031b..b4ff8f66c55f 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
// adjcallstackdown instruction into 'add SP, <amt>'
// TODO: consider using push / pop instead of sub + store / add
MachineInstr &Old = *I;
- uint64_t Amount = Old.getOperand(0).getImm();
+ uint64_t Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else {
assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
// factor out the amount the callee already popped.
- uint64_t CalleeAmt = Old.getOperand(1).getImm();
- Amount -= CalleeAmt;
+ Amount -= TII.getFramePoppedByCallee(Old);
if (Amount)
New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
MSP430::SP)
@@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back.
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) {
MachineInstr &Old = *I;
MachineInstr *New =
BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 40b1dd3cc2eb..cc6e64043f54 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -40,21 +40,24 @@ using namespace llvm;
typedef enum {
NoHWMult,
- HWMultIntr,
- HWMultNoIntr
+ HWMult16,
+ HWMult32,
+ HWMultF5
} HWMultUseMode;
static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode", cl::Hidden,
+HWMultMode("mhwmult", cl::Hidden,
cl::desc("Hardware multiplier use mode"),
- cl::init(HWMultNoIntr),
+ cl::init(NoHWMult),
cl::values(
- clEnumValN(NoHWMult, "no",
+ clEnumValN(NoHWMult, "none",
"Do not use hardware multiplier"),
- clEnumValN(HWMultIntr, "interrupts",
- "Assume hardware multiplier can be used inside interrupts"),
- clEnumValN(HWMultNoIntr, "use",
- "Assume hardware multiplier cannot be used inside interrupts")));
+ clEnumValN(HWMult16, "16bit",
+ "Use 16-bit hardware multiplier"),
+ clEnumValN(HWMult32, "32bit",
+ "Use 32-bit hardware multiplier"),
+ clEnumValN(HWMultF5, "f5series",
+ "Use F5 series hardware multiplier")));
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI)
@@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// FIXME: Implement efficiently multiplication by a constant
- setOperationAction(ISD::MUL, MVT::i8, Expand);
- setOperationAction(ISD::MULHS, MVT::i8, Expand);
- setOperationAction(ISD::MULHU, MVT::i8, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MUL, MVT::i8, Promote);
+ setOperationAction(ISD::MULHS, MVT::i8, Promote);
+ setOperationAction(ISD::MULHU, MVT::i8, Promote);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::MUL, MVT::i16, LibCall);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::UDIV, MVT::i8, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::UREM, MVT::i8, Expand);
- setOperationAction(ISD::SDIV, MVT::i8, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::SREM, MVT::i8, Expand);
- setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i8, Promote);
+ setOperationAction(ISD::UDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::UREM, MVT::i8, Promote);
+ setOperationAction(ISD::SDIV, MVT::i8, Promote);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::SREM, MVT::i8, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::UREM, MVT::i16, Expand);
- setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, LibCall);
+ setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, LibCall);
// varargs support
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i16, Custom);
- // Libcalls names.
- if (HWMultMode == HWMultIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
- } else if (HWMultMode == HWMultNoIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+ // EABI Libcalls - EABI Section 6.2
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Floating point conversions - EABI Table 6
+ { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID },
+
+ // Floating point comparisons - EABI Table 7
+ { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ },
+ { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE },
+ { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE },
+ { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT },
+ { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE },
+ { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT },
+ { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ },
+ { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE },
+ { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE },
+ { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT },
+ { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE },
+ { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT },
+
+ // Floating point arithmetic - EABI Table 8
+ { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID },
+ { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID },
+ // The following are NOT implemented in libgcc
+ // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID },
+ // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID },
+
+ // TODO: SLL/SRA/SRL are in libgcc, RLL isn't
+
+ // Universal Integer Operations - EABI Table 9
+ { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID },
+
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+
+ if (HWMultMode == HWMult16) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw" },
+ // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMult32) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" },
+ // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMultF5) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" },
+ // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else { // NoHWMult
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll" },
+ // The __mspabi_mpysl* functions are NOT implemented in libgcc
+ // The __mspabi_mpyul* functions are NOT implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
}
+ // Several of the runtime library functions use a special calling conv
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+ // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
setMinFunctionAlignment(1);
setPrefFunctionAlignment(2);
}
@@ -281,10 +452,27 @@ template<typename ArgT>
static void AnalyzeArguments(CCState &State,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ArgT> &Args) {
- static const MCPhysReg RegList[] = {
+ static const MCPhysReg CRegList[] = {
MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
- static const unsigned NbRegs = array_lengthof(RegList);
+ static const unsigned CNbRegs = array_lengthof(CRegList);
+ static const MCPhysReg BuiltinRegList[] = {
+ MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+ };
+ static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+ ArrayRef<MCPhysReg> RegList;
+ unsigned NbRegs;
+
+ bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+ if (Builtin) {
+ RegList = BuiltinRegList;
+ NbRegs = BuiltinNbRegs;
+ } else {
+ RegList = CRegList;
+ NbRegs = CNbRegs;
+ }
if (State.isVarArg()) {
AnalyzeVarArgs(State, Args);
@@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State,
SmallVector<unsigned, 4> ArgsParts;
ParseFunctionArgs(Args, ArgsParts);
+ if (Builtin) {
+ assert(ArgsParts.size() == 2 &&
+ "Builtin calling convention requires two arguments");
+ }
+
unsigned RegsLeft = NbRegs;
bool UsedStack = false;
unsigned ValNo = 0;
@@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State,
unsigned Parts = ArgsParts[i];
+ if (Builtin) {
+ assert(Parts == 4 &&
+ "Builtin calling convention requires 64-bit arguments");
+ }
+
if (!UsedStack && Parts == 2 && RegsLeft == 1) {
// Special case for 32-bit register split, see EABI section 3.3.3
unsigned Reg = State.AllocateReg(RegList);
@@ -400,6 +598,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
switch (CallConv) {
default:
llvm_unreachable("Unsupported calling convention");
+ case CallingConv::MSP430_BUILTIN:
case CallingConv::Fast:
case CallingConv::C:
return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// LowerCCCCallTo - functions arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
SDValue MSP430TargetLowering::LowerCCCCallTo(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
unsigned NumBytes = CCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
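The MSP430 changes implement the EABI's special convention for the large runtime helpers: calls marked CallingConv::MSP430_BUILTIN pass a pair of 64-bit operands in R8..R15 (four 16-bit parts each) instead of the C convention's R12..R15, which is why later hunks also add R11 to the call clobbers and drop it from the callee-saved lists. A standalone sketch of the register-list selection (register names are symbolic, not the LLVM enums):

    #include <cassert>
    #include <cstddef>

    enum Reg { R8, R9, R10, R11, R12, R13, R14, R15 };

    int main() {
      static const Reg CRegList[] = {R12, R13, R14, R15};
      static const Reg BuiltinRegList[] = {R8,  R9,  R10, R11,
                                           R12, R13, R14, R15};
      bool Builtin = true; // CallingConv::MSP430_BUILTIN
      const Reg *RegList = Builtin ? BuiltinRegList : CRegList;
      size_t NbRegs = Builtin ? 8u : 4u;
      // Two i64 arguments, four 16-bit parts each, fill all eight registers.
      assert(RegList[0] == R8 && NbRegs == 2 * 4);
    }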
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd6a7bc..d81f17e753c5 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+ assert(isFrameInstr(I) && "Not a frame instruction");
+ assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+ return I.getOperand(1).getImm();
+ }
};
}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 22fc2474fae6..1cd18611e52c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
// Type Profiles.
//===----------------------------------------------------------------------===//
def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+ SDTCisVT<1, i16>]>;
def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SR.
let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(MSP430callseq_start timm:$amt)]>;
+ [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKUP",
[(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
@@ -209,7 +210,7 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Defs = [R12, R13, R14, R15, SR],
+ let Defs = [R11, R12, R13, R14, R15, SR],
Uses = [SP] in {
def CALLi : II16i<0x0,
(outs), (ins i16imm:$dst),
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 81cd9d1ad3f8..9600bc28f100 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const Function* F = MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsFP[] = {
MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsIntr[] = {
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 21c99da0922d..b83f44a74d5b 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
if (NumBytes < 16)
NumBytes = 16;
- emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+ emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0);
// Process the args.
MVT firstMVT;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 8f39ebd42a5c..78bae6954c3c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index b90077d7807d..8761946b8dbb 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisSameAs<3, 4>,
SDTCisInt<4>]>;
-def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
@@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
}
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
-def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 68dcbdfb4211..f8d9c34556bc 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
// Get the instruction that loads the function address from the GOT.
Reg = MO->getReg();
- Val = (Value*)nullptr;
+ Val = nullptr;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineInstr *DefMI = MRI.getVRegDef(Reg);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 61fdda8aa109..ebaaf42bc64e 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Chain;
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign);
+ TheStoreType, MachinePointerInfo(), EltAlign,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
// Cleanup.
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
theVal, InFlag };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
CopyParamOps, elemtype,
- MachinePointerInfo());
+ MachinePointerInfo(), /* Align */ 0,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
}
@@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
SDValue RetVal = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign);
+ MachinePointerInfo(), EltAlign, /* Volatile */ false,
+ /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
for (unsigned j = 0; j < NumElts; ++j) {
SDValue Ret = RetVal.getValue(j);
@@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
StoreOperands, TheStoreType,
- MachinePointerInfo(), 1);
+ MachinePointerInfo(), /* Align */ 1,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
// Cleanup vector state.
StoreOperands.clear();
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9378b29a9d0e..b5b5ea1ed639 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
(CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
@@ -3126,10 +3127,10 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: NVPTXInst<outs, ins, asmstr, pattern>;
def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt),
- "\\{ // callseq $amt\n"
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2\n"
"\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt)]>;
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"\\} // callseq $amt1",
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 12ffbfdeacc1..11d22377611b 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -204,6 +204,17 @@ static const unsigned G8Regs[] = {
PPC::X28, PPC::X29, PPC::X30, PPC::X31
};
+static const unsigned G80Regs[] = {
+ PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
static const unsigned QFRegs[] = {
PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
@@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, G8Regs);
}
+static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G80Regs);
+}
+
#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 609d959c6d08..84bb9ec56800 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (MI->getOpcode() == PPC::RLDICR) {
+ if (MI->getOpcode() == PPC::RLDICR ||
+ MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
unsigned char ME = MI->getOperand(3).getImm();
// rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 9b91b9ab8f82..2fc8654deeab 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Prepare to assign register arguments. Every argument uses up a
// GPR protocol register even if it's passed in a floating-point
@@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
case PPC::EXTSW:
+ case PPC::EXTSW_32:
case PPC::EXTSW_32_64: {
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
return false;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1b0402bf003d..5fa7b2c6bfb1 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -54,6 +54,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -68,6 +69,14 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-codegen"
+STATISTIC(NumSextSetcc,
+ "Number of (sext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(NumZextSetcc,
+ "Number of (zext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(SignExtensionsAdded,
+ "Number of sign extensions for compare inputs added.");
+STATISTIC(ZeroExtensionsAdded,
+ "Number of zero extensions for compare inputs added.");
// FIXME: Remove this once the bug has been fixed!
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -252,7 +261,28 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
+ // Conversion type for interpreting results of a 32-bit instruction as
+ // a 64-bit value or vice versa.
+ enum ExtOrTruncConversion { Ext, Trunc };
+
+ // Modifiers to guide how an ISD::SETCC node's result is to be computed
+ // in a GPR.
+ // ZExtOrig - use the original condition code, zero-extend value
+ // ZExtInvert - invert the condition code, zero-extend value
+ // SExtOrig - use the original condition code, sign-extend value
+ // SExtInvert - invert the condition code, sign-extend value
+ enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
bool trySETCC(SDNode *N);
+ bool tryEXTEND(SDNode *N);
+ SDValue signExtendInputIfNeeded(SDValue Input);
+ SDValue zeroExtendInputIfNeeded(SDValue Input);
+ SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+ SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+ assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) &&
+ "Expecting a zero/sign extend node!");
+
+ if (N->getOperand(0).getOpcode() != ISD::SETCC)
+ return false;
+
+ SDValue WideRes =
+ getSETCCInGPR(N->getOperand(0),
+ N->getOpcode() == ISD::SIGN_EXTEND ?
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+ if (!WideRes)
+ return false;
+
+ SDLoc dl(N);
+ bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+ bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+ NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+ NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+ SDValue ConvOp = WideRes;
+ if (Inputs32Bit != Output32Bit)
+ ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+ ExtOrTruncConversion::Trunc);
+ ReplaceNode(N, ConvOp.getNode());
+
+ return true;
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only sign-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+ // The value was sign extended and then truncated to 32-bits. No need to
+ // sign extend it again.
+ if (Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+ Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+ return Input;
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a sign-extending load. No reason to sign-extend.
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ // We don't sign-extend constants and already sign-extended values.
+ if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+ Opc == ISD::SIGN_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ SignExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only zero-extend 32-bit values here.");
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ unsigned Opc = Input.getOpcode();
+
+ // No need to zero-extend loaded values (unless they're loaded with
+ // a sign-extending load).
+ if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+ // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have
+ // to conservatively actually clear the high bits. We also don't need to
+ // zero-extend constants or values that are already zero-extended.
+ if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ ZeroExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+ getI64Imm(0, dl), getI64Imm(32, dl)),
+ 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice versa. These do not
+// generate any actual zero/sign-extension machine code; they just
+// reinterpret a 32-bit value in a register as a 64-bit value and back.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+ ExtOrTruncConversion Conv) {
+ SDLoc dl(NatWidthRes);
+
+ // For reinterpreting 32-bit values as 64 bit values, we generate
+ // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
+ if (Conv == ExtOrTruncConversion::Ext) {
+ SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+ ImDef, NatWidthRes, SubRegIdx), 0);
+ }
+
+ assert(Conv == ExtOrTruncConversion::Trunc &&
+ "Unknown convertion between 32 and 64 bit values.");
+ // For reinterpreting 64-bit values as 32-bit values, we just need to
+ // EXTRACT_SUBREG (i.e. extract the low word).
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+ NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+ }
+}
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (sext (setcc %a, %b, seteq)) ->
+ // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+ // (sext (setcc %a, 0, seteq)) ->
+ // (ashr (shl (ctlz %a), 58), 63)
+ SDValue CountInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Cntlzw =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+ SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+ SDValue Sldi =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+ getI32Imm(63, dl)), 0);
+ }
+ }
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+ SetccInGPROpts ConvOpts) {
+ assert((Compare.getOpcode() == ISD::SETCC ||
+ Compare.getOpcode() == ISD::SELECT_CC) &&
+ "An ISD::SETCC node required here.");
+
+ SDValue LHS = Compare.getOperand(0);
+ SDValue RHS = Compare.getOperand(1);
+
+ // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+ int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+ EVT InputVT = LHS.getValueType();
+ if (InputVT != MVT::i32)
+ return SDValue();
+
+ SDLoc dl(Compare);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+
+ if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+ ConvOpts == SetccInGPROpts::SExtInvert)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ if (ISD::isSignedIntSetCC(CC)) {
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ }
+
+ bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+ ConvOpts == SetccInGPROpts::SExtInvert;
+ if (IsSext)
+ return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ if (tryEXTEND(N))
+ return;
+ break;
+
case ISD::SETCC:
if (trySETCC(N))
return;
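Both expansions documented in the new PPC ISel code above hinge on one property of cntlzw: for a 32-bit value x, clz(x) == 32 exactly when x == 0, so bit 5 of the count is the equality predicate. The zero-extended form shifts that bit down to bit 0; the sign-extended form parks it in bit 63 and smears it with an arithmetic shift. A standalone check of both identities, with a portable C++ stand-in for cntlzw:

    #include <cassert>
    #include <cstdint>

    static unsigned Clz32(uint32_t X) { // cntlzw semantics: Clz32(0) == 32
      unsigned N = 0;
      for (int Bit = 31; Bit >= 0 && !((X >> Bit) & 1); --Bit)
        ++N;
      return N;
    }

    int main() {
      uint32_t A = 0x1234, B = 0x1234, C = 0x1235;
      // (zext (setcc a, b, seteq)) -> (lshr (cntlzw (xor a, b)), 5)
      assert((Clz32(A ^ B) >> 5) == 1);
      assert((Clz32(A ^ C) >> 5) == 0);
      // (sext (setcc a, b, seteq)) -> (ashr (shl (ctlz (xor a, b)), 58), 63)
      auto SextEq = [](uint32_t Clz) {
        return (int64_t)((uint64_t)Clz << 58) >> 63; // arithmetic shift
      };
      assert(SextEq(Clz32(A ^ B)) == -1);
      assert(SextEq(Clz32(A ^ C)) == 0);
    }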
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 685f24cb502e..17bdd595da10 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
@@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
@@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
// The mappings for emitLeading/TrailingFence is taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
return nullptr;
}
-Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
- if (IsLoad && isAcquireOrStronger(Ord))
+Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
// FIXME: this is too conservative, a dependent branch + isync is enough.
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
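The fence hooks now receive the atomic instruction itself, so the trailing-fence decision can ask `Inst->hasAtomicLoad()` instead of threading IsStore/IsLoad flags through every caller; loads, cmpxchg, and atomicrmw all perform an atomic load and therefore still get the acquire lwsync. A comment-form summary of the mapping the two hunks implement (restating the cited table, not new behavior):

    // seq_cst             : leading ppc_sync
    // release or stronger : leading ppc_lwsync
    // acquire or stronger : trailing ppc_lwsync, only when the instruction
    //                       performs an atomic load (load/cmpxchg/atomicrmw)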
@@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
+ case ISD::SHL:
+ return combineSHL(N, DCI);
+ case ISD::SRA:
+ return combineSRA(N, DCI);
+ case ISD::SRL:
+ return combineSRL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return Imm.isPosZero();
}
}
+
+// For vector shift operation op, fold
+// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
+static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ unsigned Opcode = N->getOpcode();
+ unsigned TargetOpcode;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected shift operation");
+ case ISD::SHL:
+ TargetOpcode = PPCISD::SHL;
+ break;
+ case ISD::SRL:
+ TargetOpcode = PPCISD::SRL;
+ break;
+ case ISD::SRA:
+ TargetOpcode = PPCISD::SRA;
+ break;
+ }
+
+ if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
+ N1->getOpcode() == ISD::AND)
+ if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
+ if (Mask->getZExtValue() == OpSizeInBits - 1)
+ return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
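
The fold is sound because the PPC target shift nodes already reduce each lane's shift amount modulo the element width (see the PPCISelLowering.h comment below), so the explicit AND mask in the IR is redundant. A quick self-check for one v4i32 lane, modeled in plain C++ (vslw_lane is a hypothetical name for the hardware behavior):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Altivec vslw uses only the low log2(32) = 5 bits of each lane's
    // shift amount, i.e. it shifts by (amt % 32).
    static uint32_t vslw_lane(uint32_t x, uint32_t amt) {
      return x << (amt & 31);
    }

    int main() {
      const uint32_t x = 0x80000001u;
      for (uint32_t amt = 0; amt < 256; ++amt) {
        uint32_t masked = x << (amt & 31);   // IR: shl(x, and(amt, 31))
        assert(masked == vslw_lane(x, amt)); // target: PPCISD::SHL(x, amt)
      }
      std::puts("AND mask is redundant for every shift amount");
      return 0;
    }
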
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 32661099b79d..4fc744257262 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -117,9 +117,13 @@ namespace llvm {
/// at function entry, used for PIC code.
GlobalBaseReg,
- /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
- /// shift amounts. These nodes are generated by the multi-precision shift
- /// code.
+ /// These nodes represent PPC shifts.
+ ///
+ /// For scalar types, only the low-order `n + 1` bits of the shift amounts
+ /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
+ /// for the exact behavior.
+ ///
+ /// For vector types, only the low-order n bits are used. See vsld.
SRL, SRA, SHL,
/// The combination of sra[wd]i and addze used to implement signed
@@ -617,10 +621,10 @@ namespace llvm {
return true;
}
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -999,6 +1003,9 @@ namespace llvm {
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
@@ -1017,14 +1024,6 @@ namespace llvm {
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
-
- bool supportsModuloShift(ISD::NodeType Inst,
- EVT ReturnType) const override {
- assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
- "Expect a shift instruction");
- assert(isOperationLegal(Inst, ReturnType));
- return ReturnType.isVector();
- }
};
namespace PPC {
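
The n-versus-n+1 distinction in the comment above can be made concrete. For 32-bit elements n = 5: scalar slw consumes six bits of the shift amount (values 32-63 shift everything out and produce 0), while vector vslw consumes only five (amounts wrap modulo 32). A small model, assuming the ISA behavior the comment describes:

    #include <cstdint>
    #include <cstdio>

    // Scalar slw: looks at the low 6 bits; 32..63 yields zero.
    static uint32_t slw(uint32_t x, uint32_t amt) {
      amt &= 63;
      return amt > 31 ? 0 : x << amt;
    }

    // Vector vslw (one lane): looks at the low 5 bits only.
    static uint32_t vslw_lane(uint32_t x, uint32_t amt) {
      return x << (amt & 31);
    }

    int main() {
      std::printf("slw(1, 33)       = %u\n", slw(1, 33));       // 0
      std::printf("vslw_lane(1, 33) = %u\n", vslw_lane(1, 33)); // 2 (33 % 32 == 1)
      return 0;
    }
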
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 997b96ca6ec8..a8433919f0f3 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
"extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+let isCodeGenOnly = 1 in
+def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS),
+ "extsw $rA, $rS", IIC_IntSimple,
+ []>, isPPC64;
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH),
+ "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64;
+
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
@@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0,
// For fast-isel:
let isCodeGenOnly = 1 in
def RLDICL_32_64 : MDForm_1<30, 0,
- (outs g8rc:$rA),
- (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
- []>, isPPC64;
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
// End fast-isel.
+let isCodeGenOnly = 1 in
+def RLDICL_32 : MDForm_1<30, 0,
+ (outs gprc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
+let isCodeGenOnly = 1 in
+def RLDICR_32 : MDForm_1<30, 1,
+ (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDIC : MDForm_1r<30, 2,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index c380766e9f5c..e14d18fd5433 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSLW $vA, $vB))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRW $vA, $vB))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRAW $vA, $vB))>;
// Float to integer and integer to float conversions
def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
@@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
// Vector shifts
def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsld $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+ "vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrd $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+ "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrad $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+ "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>;
+
+def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
// Vector Integer Arithmetic Instructions
let isCommutable = 1 in {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f004ce49cac0..1af5e7f28342 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
-def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_PPCvperm : SDTypeProfile<1, 3, [
@@ -1099,9 +1100,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
let hasCtrlDep = 1 in {
let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0
def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi $rA, $rS, $n",
+ (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 967557452f24..b98140fedfc0 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrws $XT, $rA", IIC_VecGeneral, []>;
- def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 0c1260a2965b..c7aa4cb78b7a 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -99,7 +99,8 @@ protected:
// Don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
// gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
+ .addImm(0);
// Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index acb34d5baaa8..9e7e3c6b705a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InFlag;
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
InFlag = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ case 'f':
+ case 'e':
+ return C_RegisterClass;
case 'I': // SIMM13
return C_Other;
}
@@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &SP::IntPairRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::LowDFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::LowQFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for f-register-type!");
+ break;
+ case 'e':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::DFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::QFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for e-register-type!");
+ break;
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
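
With 'f' and 'e' wired up, SPARC inline assembly that names floating-point register classes can now be register-allocated. A hedged usage sketch (compiles only when targeting SPARC with an FPU; faddd is the double-precision add, and 'e' accepts the full D-register file, matching the DFPRegs mapping above):

    // Sketch only: requires a SPARC target.
    double fadd_inline(double a, double b) {
      double r;
      asm("faddd %1, %2, %0" : "=e"(r) : "e"(a), "e"(b));
      return r;
    }
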
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 5a19c624abb5..ae45c8be6752 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
[SDNPHasChain, SDNPSideEffect]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -404,9 +405,9 @@ let Defs = [O7] in {
}
let Defs = [O6], Uses = [O6] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "!ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKDOWN $amt1, $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"!ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 6ecfddfc7d66..6625eaafd992 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
// Floating point register classes.
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
-
def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
-
def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+// The Low?FPRegs classes are used only for inline-asm constraints.
+def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>;
+
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 3f91ca9035a6..efcf6696fd50 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -262,6 +262,9 @@ public:
bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
}
+ bool isMemDisp12Len4(RegisterKind RegKind) const {
+ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10);
+ }
bool isMemDisp12Len8(RegisterKind RegKind) const {
return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
}
@@ -347,6 +350,7 @@ public:
bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+ bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); }
bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index a281a0aa6bcc..27fd70bc6092 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
return MCDisassembler::Success;
}
+static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Length = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 16 && "Invalid BDLAddr12Len4");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createImm(Length + 1));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
const unsigned *Regs) {
uint64_t Length = Field >> 16;
@@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
}
+static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
uint64_t Field,
uint64_t Address,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 092eb4011adc..d188f56512ab 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -77,6 +77,9 @@ private:
uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -220,6 +223,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
}
uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
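
This encoder and the disassembler's decodeBDLAddr12Len4Operand above must agree on the field layout: length minus one in bits 16-19, base register in bits 12-15, displacement in bits 0-11. A standalone round-trip check of that layout (names local to the snippet):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors getBDLAddr12Len4Encoding: the length is stored biased by one.
    static uint64_t encode(uint64_t Base, uint64_t Disp, uint64_t Len) {
      assert(Base < 16 && Disp < 4096 && Len >= 1 && Len <= 16);
      return ((Len - 1) << 16) | (Base << 12) | Disp;
    }

    // Mirrors decodeBDLAddr12Len4Operand.
    static void decode(uint64_t Field, uint64_t &Base, uint64_t &Disp,
                       uint64_t &Len) {
      Len = (Field >> 16) + 1;
      Base = (Field >> 12) & 0xf;
      Disp = Field & 0xfff;
    }

    int main() {
      uint64_t B, D, L;
      decode(encode(13, 0x123, 4), B, D, L);
      assert(B == 13 && D == 0x123 && L == 4);
      std::printf("base=%llu disp=0x%llx len=%llu\n",
                  (unsigned long long)B, (unsigned long long)D,
                  (unsigned long long)L);
      return 0;
    }
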
+
+uint64_t SystemZMCCodeEmitter::
getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 86a1322c9e23..74cf653b9d95 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -63,7 +63,7 @@ via a register.)
--
-We don't use ICM or STCM.
+We don't use ICM, STCM, or CLM.
--
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index 716e5add8051..7bfa378aa85c 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature<
"Assume that the population-count facility is installed"
>;
+def FeatureMessageSecurityAssist4 : SystemZFeature<
+ "message-security-assist-extension4", "MessageSecurityAssist4",
+ "Assume that the message-security-assist extension facility 4 is installed"
+>;
+
def Arch9NewFeatures : SystemZFeatureList<[
FeatureDistinctOps,
FeatureFastSerialization,
@@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[
FeatureHighWord,
FeatureInterlockedAccess1,
FeatureLoadStoreOnCond,
- FeaturePopulationCount
+ FeaturePopulationCount,
+ FeatureMessageSecurityAssist4
]>;
//===----------------------------------------------------------------------===//
@@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature<
"Assume that the load/store-on-condition facility 2 is installed"
>;
+def FeatureMessageSecurityAssist5 : SystemZFeature<
+ "message-security-assist-extension5", "MessageSecurityAssist5",
+ "Assume that the message-security-assist extension facility 5 is installed"
+>;
+
def FeatureVector : SystemZFeature<
"vector", "Vector",
"Assume that the vectory facility is installed"
@@ -142,6 +153,7 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">;
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
FeatureLoadStoreOnCond2,
+ FeatureMessageSecurityAssist5,
FeatureVector
]>;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6989aabb8c6a..235e095f0010 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Mark the start of the call.
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, DL, PtrVT, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
llvm_unreachable("Unexpected instr type to insert");
}
}
+
+// This is only used by the isel schedulers, and is needed only to prevent
+// the compiler from crashing when list-ilp is used.
+const TargetRegisterClass *
+SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
+ if (VT == MVT::Untyped)
+ return &SystemZ::ADDR128BitRegClass;
+ return TargetLowering::getRepRegClassFor(VT);
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 1c34dc43e8bb..79c8c4d92669 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -590,6 +590,8 @@ private:
MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
+
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index bb6d27e24828..364b81f98eed 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+// Divide to integer.
+let Defs = [CC] in {
+ def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>;
+ def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>;
+}
+
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
@@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+
+ def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>;
+ def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>;
+ def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>;
+
+ def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>;
+ def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>;
}
// Test Data Class.
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index c727f486087e..a37da2807854 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -710,6 +710,21 @@ class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = RI2;
}
+class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = 0;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -817,6 +832,37 @@ class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD2;
}
+class InstSSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<20> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = BDL2{19-16};
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BDL2{15-0};
+}
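
The SS-b layout packs two 20-bit base+displacement+length operands into one 48-bit word, splitting each operand's 4-bit length nibble from its 16-bit base/displacement half. The let-assignments translate mechanically into shifts and masks; a standalone mirror of them:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the InstSSb field assignments; BDL1/BDL2 are the 20-bit
    // operand fields ({len:4, base:4, disp:12}).
    static uint64_t encodeSSb(uint8_t Op, uint32_t BDL1, uint32_t BDL2) {
      uint64_t Inst = 0;
      Inst |= uint64_t(Op) << 40;                 // Inst{47-40} = op
      Inst |= uint64_t((BDL1 >> 16) & 0xf) << 36; // Inst{39-36} = BDL1{19-16}
      Inst |= uint64_t((BDL2 >> 16) & 0xf) << 32; // Inst{35-32} = BDL2{19-16}
      Inst |= uint64_t(BDL1 & 0xffff) << 16;      // Inst{31-16} = BDL1{15-0}
      Inst |= uint64_t(BDL2 & 0xffff);            // Inst{15-0}  = BDL2{15-0}
      return Inst;
    }

    int main() {
      // e.g. PACK (opcode 0xF2) with two arbitrary operand fields.
      std::printf("0x%012llx\n",
                  (unsigned long long)encodeSSb(0xF2, 0x3A123, 0x5B456));
      return 0;
    }
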
+
+class InstSSc<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<16> BD2;
+ bits<4> I3;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = I3;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BD2;
+}
+
class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -850,6 +896,20 @@ class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD4;
}
+class InstSSf<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<24> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-32} = BDL2{23-16};
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BDL2{15-0};
+}
+
class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1567,6 +1627,9 @@ class ICV<string name>
// Inherent:
// One register output operand and no input operands.
//
+// InherentDual:
+// Two register output operands and no input operands.
+//
// StoreInherent:
// One address operand. The instruction stores to the address.
//
@@ -1642,8 +1705,9 @@ class ICV<string name>
// Two input operands and an implicit CC output operand.
//
// Test:
-// Two input operands and an implicit CC output operand. The second
-// input operand is an "address" operand used as a test class mask.
+// One or two input operands and an implicit CC output operand. If
+// present, the second input operand is an "address" operand used as
+// a test class mask.
//
// Ternary:
// One register output operand and three input operands.
@@ -1691,6 +1755,10 @@ class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
let R2 = 0;
}
+class InherentDualRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1, cls:$R2), (ins),
+ mnemonic#"\t$R1, $R2", []>;
+
class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
: InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> {
let I2 = value;
@@ -1714,6 +1782,12 @@ class SideEffectInherentS<string mnemonic, bits<16> opcode,
let BD2 = 0;
}
+class SideEffectInherentRRE<string mnemonic, bits<16> opcode>
+ : InstRRE<opcode, (outs), (ins), mnemonic, []> {
+ let R1 = 0;
+ let R2 = 0;
+}
+
// Allow an optional TLS marker symbol to generate TLS call relocations.
class CallRI<string mnemonic, bits<12> opcode>
: InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2),
@@ -2084,6 +2158,13 @@ multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
}
}
+class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstSSe<opcode, (outs cls:$R1, cls:$R3),
+ (ins bdaddr12only:$BD2, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $R3, $BD2, $BD4", []> {
+ let mayLoad = 1;
+}
+
class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -2355,6 +2436,15 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpType = "reg";
}
+class UnaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let M3 = 0;
+}
+
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
@@ -2585,11 +2675,61 @@ class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
mnemonic#"\t$I1, $I2", []>;
+class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm>
+ : InstSI<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2", []>;
+
class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
SDPatternOperator operator, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+class SideEffectBinarySSa<string mnemonic, bits<8> opcode>
+ : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+
+class SideEffectBinarySSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []>;
+
+class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
+ : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
+ mnemonic##"\t$BD1, $BDL2", []>;
+
+class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R2 = $R2src";
+ let DisableEncoding = "$R2src";
+}
+
+class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ let M3 = 0;
+}
+
class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
@@ -2654,6 +2794,20 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = 0;
}
+class BinaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2> {
+ def "" : BinaryMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
@@ -3112,6 +3266,34 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let AccessBytes = bytes;
}
+class StoreBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreBinaryRSY<mnemonic#"y", rsyOpcode, cls, bytes,
+ bdaddr20pair>;
+ }
+}
+
class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
Immediate index>
: InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
@@ -3237,6 +3419,40 @@ multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
}
}
+class CompareRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ RegisterOperand cls, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : CompareRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+ }
+}
+
+class CompareSSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []> {
+ let isCompare = 1;
+ let mayLoad = 1;
+}
+
class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm,
AddressingMode mode = bdaddr12only>
@@ -3313,18 +3529,68 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = 0;
}
+class TestRSL<string mnemonic, bits<16> opcode>
+ : InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
+ mnemonic#"\t$BDL1", []> {
+ let mayLoad = 1;
+}
+
+class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
+ : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1,
+ shift12only:$BD2, imm32zx4:$I3),
+ mnemonic##"\t$BDL1, $BD2, $I3", []>;
+
+class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls2:$R2, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R2src, $R3src";
+ let M4 = 0;
+}
+
class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
Immediate imm>
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
+class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ Immediate imm>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2),
+ (ins cls1:$R1src, cls2:$R2src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryMemMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryMemMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstSSF<opcode, (outs),
(ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
mnemonic#"\t$BD1, $BD2, $R3", []>;
+class TernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1),
@@ -3376,6 +3642,24 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
}
}
+class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
+class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXF<opcode, (outs cls:$R1),
@@ -3981,9 +4265,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
// another instruction to handle the excess.
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
- bdaddr12only:$BD2),
- mnemonic##"\t$BDL1, $BD2", []>;
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
@@ -4003,13 +4285,8 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
// the full loop (the main instruction plus the branch on CC==3).
multiclass StringRRE<string mnemonic, bits<16> opcode,
SDPatternOperator operator> {
- def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
- (ins GR64:$R1src, GR64:$R2src),
- mnemonic#"\t$R1, $R2", []> {
- let Uses = [R0L];
- let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
- }
+ let Uses = [R0L] in
+ def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def Loop : Pseudo<(outs GR64:$end),
(ins GR64:$start1, GR64:$start2, GR32:$char),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index d63525f29412..fa5ecdd85243 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
let hasNoSchedulingInfo = 1 in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
+ def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
+ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
+}
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC] in
@@ -707,6 +712,10 @@ def : StoreGR64PC<STHRL, aligned_truncstorei16>;
defm : StoreGR64Pair<ST, STY, truncstorei32>;
def : StoreGR64PC<STRL, aligned_truncstorei32>;
+// Store characters under mask -- not (yet) used for codegen.
+defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>;
+def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>;
+
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
@@ -715,6 +724,7 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>;
defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>;
// Multi-register stores.
defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
@@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+ def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>;
+
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
@@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+// Insert characters under mask -- not (yet) used for codegen.
let Defs = [CC] in {
defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
@@ -919,6 +934,10 @@ let Defs = [CC] in {
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+
+ // Addition to memory.
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
+ def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
defm : ZXB<addc, GR64, ALGFR>;
@@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
// Multiplication of a register, producing two results.
+def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>;
+def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>;
def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
// Multiplication of memory, producing two results.
+def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
+def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>;
def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
//===----------------------------------------------------------------------===//
@@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
let hasSideEffects = 1 in { // Do not speculatively execute.
// Division and remainder, from registers.
+ def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>;
def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
// Division and remainder, from memory.
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>;
def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>;
def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>;
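
MR/M and DR/D work on an even/odd register pair, which these definitions model as GR128: the multiply spreads a 64-bit product across the pair, and the divide reads a 64-bit dividend from it, writing remainder and quotient back. An illustrative C++ model of the pairing, assuming the classic S/390 semantics:

    #include <cstdint>
    #include <cstdio>

    // MR: 32x32 -> 64 signed multiply; the multiplicand lives in the odd
    // half of the pair, the product fills both halves.
    static void mr(int32_t &Even, int32_t &Odd, int32_t Mult) {
      int64_t P = (int64_t)Odd * Mult;
      Even = (int32_t)(P >> 32);
      Odd = (int32_t)P;
    }

    // DR: 64-bit dividend in the pair; remainder to even, quotient to odd.
    static void dr(int32_t &Even, int32_t &Odd, int32_t Div) {
      int64_t Dividend = ((int64_t)Even << 32) | (uint32_t)Odd;
      Even = (int32_t)(Dividend % Div);
      Odd = (int32_t)(Dividend / Div);
    }

    int main() {
      int32_t E = 0, O = 7;
      mr(E, O, 6); // 7 * 6 = 42, spread across the pair
      dr(E, O, 5); // 42 / 5 -> remainder 2, quotient 8
      std::printf("rem=%d quot=%d\n", E, O);
      return 0;
    }
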
@@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute.
// Shifts
//===----------------------------------------------------------------------===//
-// Shift left.
+// Logical shift left.
let hasSideEffects = 0 in {
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
- defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+ def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
+}
+
+// Arithmetic shift left.
+let Defs = [CC] in {
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+ def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>;
+ def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>;
}
// Logical shift right.
let hasSideEffects = 0 in {
defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+ def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
}
// Arithmetic shift right.
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+ def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>;
}
// Rotate left.
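
Moving SLA/SLAG/SLDA under Defs = [CC] reflects the architecture: arithmetic shift left keeps the sign bit fixed and reports overflow in the condition code, which the logical shifts never do. An illustrative 32-bit SLA model (simplified; the possible fixed-point-overflow interruption is ignored):

    #include <cstdint>
    #include <cstdio>

    // The sign bit never moves; CC is 3 on overflow, otherwise 0/1/2 for
    // zero/negative/positive results.
    static int32_t sla32(int32_t X, unsigned Amt, unsigned &CC) {
      uint32_t V = (uint32_t)X;
      const uint32_t Sign = V & 0x80000000u;
      bool Overflow = false;
      for (unsigned I = 0; I < Amt; ++I) {
        if (((V & 0x40000000u) != 0) != (Sign != 0))
          Overflow = true;                   // a bit unlike the sign shifted out
        V = Sign | ((V << 1) & 0x7fffffffu); // shift the magnitude, keep sign
      }
      CC = Overflow ? 3 : (V == 0 ? 0 : ((int32_t)V < 0 ? 1 : 2));
      return (int32_t)V;
    }

    int main() {
      unsigned CC;
      int32_t R = sla32(0x40000000, 1, CC); // magnitude bit overflows
      std::printf("result=0x%08x cc=%u\n", (uint32_t)R, CC);
      return 0;
    }
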
@@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
-let mayLoad = 1, Defs = [CC] in
+let mayLoad = 1, Defs = [CC] in {
defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
+ def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
+ def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
+}
// String comparison.
let mayLoad = 1, Defs = [CC] in
@@ -1381,6 +1420,12 @@ let Defs = [CC] in {
def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+// Compare logical characters under mask -- not (yet) used for codegen.
+let Defs = [CC] in {
+ defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>;
+ def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>;
+}
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -1581,6 +1626,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
}
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1, mayStore = 1 in
+ def TR : SideEffectBinarySSa<"tr", 0xDC>;
+
+let mayLoad = 1, Defs = [CC, R0L, R1D] in {
+ def TRT : SideEffectBinarySSa<"trt", 0xDD>;
+ def TRTR : SideEffectBinarySSa<"trtr", 0xD0>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>;
+
+let mayLoad = 1, Uses = [R1D], Defs = [CC] in {
+ defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>;
+ defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>;
+ defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>;
+ defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>;
+ defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>;
+ defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>;
+ defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>;
+ defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>;
+ def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>;
+ def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>;
+
+ let isAsmParserOnly = 1 in {
+ defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>;
+ defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>;
+ def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>;
+
+ def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>;
+ def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>;
+ def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist4] in {
+ def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>;
+ def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>;
+ def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D,
+ GR128, GR128, GR128>;
+ def PCC : SideEffectInherentRRE<"pcc", 0xB92C>;
+ }
+ let Predicates = [FeatureMessageSecurityAssist5] in
+ def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+defm CVB : BinaryRXPair<"cvb", 0x4F, 0xE306, null_frag, GR32, load, 4>;
+def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+
+defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
+def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
+
+let mayLoad = 1, mayStore = 1 in {
+ def MVN : SideEffectBinarySSa<"mvn", 0xD1>;
+ def MVZ : SideEffectBinarySSa<"mvz", 0xD3>;
+ def MVO : SideEffectBinarySSb<"mvo", 0xF1>;
+
+ def PACK : SideEffectBinarySSb<"pack", 0xF2>;
+ def PKA : SideEffectBinarySSf<"pka", 0xE9>;
+ def PKU : SideEffectBinarySSf<"pku", 0xE1>;
+ def UNPK : SideEffectBinarySSb<"unpk", 0xF3>;
+ let Defs = [CC] in {
+ def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>;
+ def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>;
+ }
+}
+
+let mayLoad = 1, mayStore = 1 in {
+ let Defs = [CC] in {
+ def AP : SideEffectBinarySSb<"ap", 0xFA>;
+ def SP : SideEffectBinarySSb<"sp", 0xFB>;
+ def ZAP : SideEffectBinarySSb<"zap", 0xF8>;
+ def SRP : SideEffectTernarySSc<"srp", 0xF0>;
+ }
+ def MP : SideEffectBinarySSb<"mp", 0xFC>;
+ def DP : SideEffectBinarySSb<"dp", 0xFD>;
+ let Defs = [CC] in {
+ def ED : SideEffectBinarySSa<"ed", 0xDE>;
+ def EDMK : SideEffectBinarySSa<"edmk", 0xDF>;
+ }
+}
+
+let Defs = [CC] in {
+ def CP : CompareSSb<"cp", 0xF9>;
+ def TP : TestRSL<"tp", 0xEBC0>;
+}
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in {
// Search a block of memory for a character.
let mayLoad = 1, Defs = [CC] in
- defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+ defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>;
+
+// Compare until substring equal.
+let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in
+ def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>;
+
+// Compare and form codeword.
+let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in
+ def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>;
+
+// Update tree.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D],
+ Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in
+ def UPT : SideEffectInherentE<"upt", 0x0102>;
+
+// Checksum.
+let mayLoad = 1, Defs = [CC] in
+ def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>;
+
+// Compression call.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
+ def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
// Supervisor call.
let hasSideEffects = 1, isCall = 1, Defs = [CC] in
def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+// Monitor call.
+let hasSideEffects = 1, isCall = 1 in
+ def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
+
// Store clock.
let hasSideEffects = 1, Defs = [CC] in {
def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
@@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in {
let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+// Extract CPU attribute.
+let hasSideEffects = 1 in
+ def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
+
// Extract CPU time.
let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+// Extract PSW.
+let hasSideEffects = 1, Uses = [CC] in
+ def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
+
// Execute.
let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7bb4fe5afb3f..713612129d90 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">;
def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">;
def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">;
@@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">;
def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">;
def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">;
def bdvaddr12only : BDVMode< "64", "12">;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index fde26ed4e1c5..adfc69c5d4cf 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -10,7 +10,8 @@
//===----------------------------------------------------------------------===//
// Type profiles
//===----------------------------------------------------------------------===//
-def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>]>;
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index dbba8ab42b5a..1ce0168f95e9 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite;
// Floating point unit (zEC12 and earlier)
def FPU : SchedWrite;
def FPU2 : SchedWrite;
+def DFU : SchedWrite;
+def DFU2 : SchedWrite;
// Vector sub units (z13)
def VecBF : SchedWrite;
def VecBF2 : SchedWrite;
def VecDF : SchedWrite;
def VecDF2 : SchedWrite;
+def VecDFX : SchedWrite;
+def VecDFX2 : SchedWrite;
def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
def VecMul : SchedWrite;
def VecStr : SchedWrite;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 7aee6f52e9a7..612c3b6cf96e 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -76,6 +76,8 @@ def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
@@ -179,6 +181,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
GroupAlone], (instregex "STM(G|H|Y)?$")>;
@@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
@@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
@@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -563,6 +579,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
+def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXb], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXb, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
@@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index a950e54e7601..670df8ff5541 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
+def Z196_DFUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
@@ -66,6 +67,8 @@ def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
@@ -520,6 +538,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 8ab6c826f1ed..1bdb8779dc72 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def ZEC12_FXUnit : ProcResource<2>;
def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
@@ -67,6 +68,8 @@ def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -532,6 +550,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index ce07ea3318a5..022679a7bc18 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasFastSerialization(false),
- HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+ HasPopulationCount(false), HasMessageSecurityAssist4(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
+ HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
HasVector(false), HasLoadStoreOnCond2(false),
- HasLoadAndZeroRightmostByte(false),
+ HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index cdb61327a16a..770dd7cd939f 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -39,6 +39,7 @@ protected:
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
+ bool HasMessageSecurityAssist4;
bool HasFastSerialization;
bool HasInterlockedAccess1;
bool HasMiscellaneousExtensions;
@@ -49,6 +50,7 @@ protected:
bool HasVector;
bool HasLoadStoreOnCond2;
bool HasLoadAndZeroRightmostByte;
+ bool HasMessageSecurityAssist5;
private:
Triple TargetTriple;
@@ -104,6 +106,10 @@ public:
// Return true if the target has the population-count facility.
bool hasPopulationCount() const { return HasPopulationCount; }
+ // Return true if the target has the message-security-assist
+ // extension facility 4.
+ bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
@@ -132,6 +138,10 @@ public:
return HasLoadAndZeroRightmostByte;
}
+ // Return true if the target has the message-security-assist
+ // extension facility 5.
+ bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; }
+
// Return true if the target has the vector facility.
bool hasVector() const { return HasVector; }
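
Both new getters follow the file's existing facility-flag pattern: a protected bool set from the CPU feature string, exposed through a const accessor that TableGen predicates and C++ lowering code can query. An illustrative, hypothetical query site, with only the getter names taken from the patch:

    // Hypothetical guard: only select MSA-4/MSA-5 instructions (e.g. the
    // KM* variants and PPNO scheduled above) when the facility is present.
    if (Subtarget.hasMessageSecurityAssist4() ||
        Subtarget.hasMessageSecurityAssist5()) {
      // ... facility-dependent selection ...
    }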
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 73d1d4be293b..6b45839c14b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
- [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a601b575f579..fa2146f7db84 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific DAG Node Types.
//===----------------------------------------------------------------------===//
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
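
FeatureSlow3OpsLEA is consumed by the X86FixupLEAs changes later in this patch; the pass now runs whenever any LEA tuning flag is set, per the gating line from the runOnMachineFunction hunk below:

    OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();

Because the feature string is "slow-3ops-lea", the new path can also be exercised in isolation with -mattr=+slow-3ops-lea.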
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ebd179e786da..fc3b4836c178 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
} // end anonymous namespace.
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
- X86::CondCode CC = X86::COND_INVALID;
- bool NeedSwap = false;
- switch (Predicate) {
- default: break;
- // Floating-point Predicates
- case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
- case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
- case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
- case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
- // Integer Predicates
- case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
- case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
- case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
- case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
- case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
- case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
- case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
- case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
- case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
@@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
X86::CondCode CC;
bool SwapArgs;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
unsigned Opc = X86::getSETFromCond(CC);
@@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool SwapArgs;
unsigned BranchOpc;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
}
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
@@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
@@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes).addImm(0);
+ .addImm(NumBytes).addImm(0).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
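
The condition-code table is deleted here rather than rewritten: every caller above now uses X86::getX86ConditionCode, so the helper presumably moved next to the other X86-namespace condition utilities (such as X86::getSETFromCond, already used in these hunks). The call-site shape is unchanged:

    // Pattern used by the hunks above after the move.
    X86::CondCode CC;
    bool NeedSwap;
    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
    if (CC > X86::LAST_VALID_COND)
      return false; // no direct lowering for this predicate (OEQ/UNE)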
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on SNB+ try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .add(Dst)
- .add(Src1)
- .add(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .add(Dst)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI) {
+
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR =
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+  // 1. lea (%base,%index,1), %base => add %index,%base
+  // 2. lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
+ const MachineOperand &Src = DstR == BaseR ? Index : Base;
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+  // lea (%base,%index,scale),%dst; add offset,%dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump(););
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+
+ else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
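
Taken together, processInstrForSlow3OpLEA turns one slow LEA into at most two cheap instructions. Worked examples under the SNB+ cost model quoted in the header comment (register choices are illustrative only):

    // Destination aliases the index, scale 1: two ADDs (first branch).
    //   lea 0x8(%rax,%rcx,1), %rcx   =>  add %rax,%rcx ; add $0x8,%rcx
    // Inefficient base, efficient index, scale 1: swap base and index.
    //   lea 0x8(%rbp,%rdx,1), %rcx   =>  lea (%rdx,%rbp,1),%rcx ; add $0x8,%rcx
    // Both registers inefficient, scale 1, no offset: copy then add.
    //   lea (%rbp,%r13,1), %rax      =>  mov %rbp,%rax ; add %r13,%rax
    // Fallback: drop the base from the LEA and add it back.
    //   lea 0x10(%r13,%rsi,4), %rdi  =>  lea 0x10(,%rsi,4),%rdi ; add %r13,%rdi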
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 12a10bf3072f..c899f0fd5100 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
@@ -1187,15 +1186,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
- SDValue ShVal = N.getNode()->getOperand(0);
+ SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
- AM.IndexReg = ShVal.getNode()->getOperand(0);
- ConstantSDNode *AddVal =
- cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ AM.IndexReg = ShVal.getOperand(0);
+ ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
@@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
- SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
- isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
- Reg = MulVal.getNode()->getOperand(0);
+ isa<ConstantSDNode>(MulVal.getOperand(1))) {
+ Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
- cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
if (foldOffsetIntoAddress(Disp, AM))
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
} else {
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
}
AM.IndexReg = AM.Base_Reg = Reg;
@@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ SDValue RHS = Handle.getValue().getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
// For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
(!(C->getZExtValue() & 0x80) ||
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
@@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Shift the immediate right by 8 bits.
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Put the value in an ABCD register.
const TargetRegisterClass *TRC;
@@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i16);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 16-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i32);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 32-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
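
The remaining ISelDAGToDAG hunks are mechanical: SDValue::getOperand forwards to the underlying node, so both spellings below yield the same operand, and the patch simply drops the redundant getNode():

    SDValue Before = N.getNode()->getOperand(1);
    SDValue After  = N.getOperand(1);   // identical SDValue, less noise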
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ee2234595f9..11c08292518a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
@@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
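
Editorial aside on the mechanism above: DiagnosticInfoUnsupported is routed through the LLVMContext's diagnostic handler rather than aborting, which is what makes the FP0 recovery paths below possible. A minimal handler sketch, assuming the handler-registration API of this era (it has since changed shape); the names here are illustrative, not part of the patch:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print the diagnostic instead of crashing; compilation can then fail cleanly.
static void reportUnsupported(const DiagnosticInfo &DI, void * /*Ctx*/) {
  DiagnosticPrinterRawOStream DP(errs());
  errs() << "error: ";
  DI.print(DP);
  errs() << "\n";
}
// Registration (era-specific API): Ctx.setDiagnosticHandler(reportUnsupported);
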
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
@@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
@@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (ValVT == MVT::f64 &&
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
- if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
- report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult(
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
- return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+ return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
@@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getSelect(DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
@@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
+  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ SDValue Cond = Op.getOperand(0);
+ // The vNi1 condition case should be handled above as it can be trivially
+ // lowered.
+ assert(Cond.getValueType().getScalarSizeInBits() ==
+ VT.getScalarSizeInBits() &&
+ "Should have a size-matched integer condition!");
+ // Build a mask by testing the condition against itself (tests for zero).
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+ }
+
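
A condensed sketch of the 512-bit branch above, mirroring the hunk rather than adding behavior: the integer condition is known to be lane-wise 0 or -1, so testing it against itself with TESTM yields the vXi1 mask the AVX-512 blend patterns expect:

// Sketch of the transform performed above (names from this patch's context).
static SDValue lowerWideVSelect(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                                SDValue Cond, SDValue T, SDValue F) {
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  // TESTM sets lane i iff (Cond[i] & Cond[i]) != 0, i.e. iff Cond[i] != 0.
  SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
  return DAG.getSelect(dl, VT, Mask, T, F);
}
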
// Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
- switch (Op.getSimpleValueType().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
@@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
- Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
@@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (BitWidth > AndBitWidth) {
KnownBits Known;
DAG.computeKnownBits(Op0, Known);
- if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
LHS = Op1;
@@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
- SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
@@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
+ SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+ Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
- Cond, Op1, Op2);
+ SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
@@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
} else {
SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
@@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (isAllOnesConstant(Mask))
- return Op;
+
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case CONVERT_MASK_TO_VEC: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
- }
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
@@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
@@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+ getZeroVector(VT, Subtarget, DAG, dl), R);
+ }
+
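
Why the ShiftAmt == 63 fold is sound: an arithmetic right shift by 63 broadcasts an i64 lane's sign bit, producing 0 or -1, which is exactly what PCMPGT with zero on the left produces. A scalar sanity check, assuming arithmetic >> on signed values (true on the targets involved):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {INT64_MIN, -1, 0, 1, INT64_MAX};
  for (int64_t X : Vals)
    assert((X >> 63) == (0 > X ? -1 : 0)); // ashr(X, 63) == cmp_sgt(0, X)
  return 0;
}
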
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
@@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+ unsigned SubVectorScale = 1;
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SubVectorScale =
+ Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+ Amt = Amt.getOperand(0);
+ }
+
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
- VT.getVectorNumElements();
+ (SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
@@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
- return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
@@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
- return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
@@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
- Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
- : (Type*)VectorType::get(ArgTy, 4);
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
-
+
case TargetOpcode::PATCHABLE_EVENT_CALL:
// Do nothing here, handle in xray instrumentation pass.
return BB;
@@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
@@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR);
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, since waiting
+  // for the ANDNP combine allows other combines to happen that prevent
+  // matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
@@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
unsigned EltBits = MaskVT.getScalarSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI)
- SraAmt = Mask.getConstantOperandVal(1);
- if ((SraAmt + 1) != EltBits)
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
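
The replaced check insisted the mask literally be an sra by EltBits-1; ComputeNumSignBits(Mask) == EltBits asks the same question semantically (every bit of each element equals the sign bit, so each lane is 0 or -1) while accepting any mask provenance. A scalar analogue of the blend identity this relies on, as a hedged illustration:

#include <cassert>
#include <cstdint>

// With M all-zeros or all-ones, the OR/AND blend selects X or Y wholesale.
static int32_t blend(int32_t M, int32_t X, int32_t Y) {
  return (X & M) | (Y & ~M);
}

int main() {
  assert(blend(-1, 7, 9) == 7); // mask of all sign bits set: pick X
  assert(blend(0, 7, 9) == 9);  // mask clear: pick Y
  return 0;
}
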
@@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
+ DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
@@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
V = Y;
if (V) {
- if (EltBits != 8 && EltBits != 16 && EltBits != 32)
- return SDValue();
-
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
@@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (V == Y)
std::swap(SubOp1, SubOp2);
- return DAG.getBitcast(VT,
- DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
}
@@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
@@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
- auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
@@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
- return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 3dc673e3c35a..d003d027ddb9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
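
For orientation (a recap of the interface change these patterns track, not new behavior): X86callseq_start/CALLSEQ_START now carries two immediates, and the C++ call sites updated elsewhere in this patch pass them separately. A hedged usage sketch:

// Mirrors the getCALLSEQ_START call sites updated above; NumBytes is the
// caller's outgoing-argument size, the second immediate any out-of-line
// adjustment (0 at simple sites).
static SDValue startCallSequence(SelectionDAG &DAG, SDValue Chain,
                                 uint64_t NumBytes, const SDLoc &dl) {
  return DAG.getCALLSEQ_START(Chain, NumBytes, /*OutOfLineBytes=*/0, dl);
}
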
// x86-64 va_start lowering magic.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 888daa275265..092ceb207ada 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
}
}
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
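
Typical use of the new helper, as in selectCmp later in this patch; the helper may request an operand swap so that ordered/unordered FP predicates map onto a single x86 condition code. Register names here are placeholders:

#include <tuple>
#include <utility>

// e.g. FCMP_OLT yields {COND_A, NeedSwap=true}: a < b (ordered) becomes b > a.
static X86::CondCode mapPredicate(CmpInst::Predicate Pred, unsigned &LHSReg,
                                  unsigned &RHSReg) {
  X86::CondCode CC;
  bool NeedSwap;
  std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Pred);
  if (NeedSwap)
    std::swap(LHSReg, RHSReg);
  return CC;
}
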
/// Return a set opcode for the given condition and
/// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
@@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
@@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
- Opc == X86::AVX512_256_SET0)
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
- Opc == X86::AVX512_512_SETALLONES);
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 38567831b3a4..e64876073ccf 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match the condition
+/// code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
/// \brief Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -186,6 +190,8 @@ public:
/// setup..destroy sequence (e.g. by pushes, or inside the callee).
int64_t getFrameAdjustment(const MachineInstr &I) const {
assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ return I.getOperand(2).getImm();
return I.getOperand(1).getImm();
}
@@ -193,7 +199,10 @@ public:
/// instruction.
void setFrameAdjustment(MachineInstr &I, int64_t V) const {
assert(isFrameInstr(I));
- I.getOperand(1).setImm(V);
+ if (isFrameSetup(I))
+ I.getOperand(2).setImm(V);
+ else
+ I.getOperand(1).setImm(V);
}
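
Summarizing the operand layout these accessors now encode (per the ADJCALLSTACKDOWN changes elsewhere in this patch; the helper below is an illustrative restatement, not new API):

// ADJCALLSTACKDOWN{32,64} amt1, amt2, amt3 -> frame setup, adjustment in op 2
// ADJCALLSTACKUP{32,64}   amt1, amt2       -> frame destroy, adjustment in op 1
static int64_t frameAdjustmentOf(const X86InstrInfo &TII,
                                 const MachineInstr &I) {
  return TII.isFrameSetup(I) ? I.getOperand(2).getImm()
                             : I.getOperand(1).getImm();
}
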
/// getSPAdjust - This returns the stack pointer adjustment made by
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 902b0c2c04e3..4d7d8ece92d9 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
-def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
@@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in {
def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
(BZHI64rm addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ // x & (-1 >> (32 - y))
+ def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x & (-1 >> (64 - y))
+ def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+
+ // x << (32 - y) >> (32 - y)
+ def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x << (64 - y) >> (64 - y)
+ def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
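
Source-level shape of the idioms the new BZHI patterns match, as a hedged example: both forms keep the low Y bits of X, which BMI2 encodes as a single bzhi (assuming 1 <= Y <= 32, so no shift amount reaches the type width):

#include <cstdint>

uint32_t keepLowViaMask(uint32_t X, uint32_t Y) {  // x & (-1 >> (32 - y))
  return X & (0xFFFFFFFFu >> (32 - Y));
}

uint32_t keepLowViaShifts(uint32_t X, uint32_t Y) { // x << (32-y) >> (32-y)
  return (X << (32 - Y)) >> (32 - Y);
}
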
let Predicates = [HasBMI] in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 48da2fa607af..f73d85e7e01b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
@@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-
-// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit
-// all ones value.
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
- (VINSERTF128rr
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
- (V_SETALLONES), 1)>;
+// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
+// with a YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
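
An intrinsics-level view of the change, hedged: a 256-bit all-ones constant on an AVX1-only target previously lowered to a 128-bit pcmpeq glued together with vinsertf128; with this pattern it may instead become vxorps + vcmptrueps (compare predicate 0xf = "always true"):

#include <immintrin.h>

// On -mavx (no AVX2), this constant may now select to:
//   vxorps     %ymm0, %ymm0, %ymm0
//   vcmptrueps %ymm0, %ymm0, %ymm0
__m256i allOnes256() { return _mm256_set1_epi32(-1); }
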
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index d65eb1de8d09..de58d719acb4 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -56,13 +56,9 @@ private:
bool selectImpl(MachineInstr &I) const;
// TODO: remove once supported by Tablegen-erated instruction selection.
- unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
- unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
uint64_t Alignment) const;
- bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -71,6 +67,10 @@ private:
MachineFunction &MF) const;
bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
const X86TargetMachine &TM;
const X86Subtarget &STI;
@@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
"Generic instruction has unexpected implicit operands\n");
if (selectImpl(I))
- return true;
+ return true;
DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
- if (selectBinaryOp(I, MRI, MF))
- return true;
if (selectLoadStoreOp(I, MRI, MF))
return true;
if (selectFrameIndexOrGep(I, MRI, MF))
@@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectTrunc(I, MRI, MF))
return true;
+ if (selectZext(I, MRI, MF))
+ return true;
+ if (selectCmp(I, MRI, MF))
+ return true;
return false;
}
-unsigned X86InstructionSelector::getFAddOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FADD;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VADDSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VADDSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSDrr;
- } else if (STI.hasSSE2()) {
- return X86::ADDSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VADDPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VADDPSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDPSrr;
- }
- }
-
- return TargetOpcode::G_FADD;
-}
-
-unsigned X86InstructionSelector::getFSubOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FSUB;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSDrr;
- } else if (STI.hasSSE2()) {
- return X86::SUBSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VSUBPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VSUBPSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBPSrr;
- }
- }
-
- return TargetOpcode::G_FSUB;
-}
-
-bool X86InstructionSelector::selectBinaryOp(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
-
- const unsigned DefReg = I.getOperand(0).getReg();
- LLT Ty = MRI.getType(DefReg);
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
-
- unsigned NewOpc = I.getOpcode();
-
- switch (NewOpc) {
- case TargetOpcode::G_FADD:
- NewOpc = getFAddOp(Ty, RB);
- break;
- case TargetOpcode::G_FSUB:
- NewOpc = getFSubOp(Ty, RB);
- break;
- default:
- break;
- }
-
- if (NewOpc == I.getOpcode())
- return false;
-
- I.setDesc(TII.get(NewOpc));
-
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-}
-
unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
unsigned Opc,
uint64_t Alignment) const {
@@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectZext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ZEXT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ if (SrcTy == LLT::scalar(1)) {
+
+ unsigned AndOpc;
+ if (DstTy == LLT::scalar(32))
+ AndOpc = X86::AND32ri8;
+ else if (DstTy == LLT::scalar(64))
+ AndOpc = X86::AND64ri8;
+ else
+ return false;
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned DefReg =
+ MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+
+ MachineInstr &AndInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg)
+ .addReg(DefReg)
+ .addImm(1);
+
+ constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool X86InstructionSelector::selectCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+ unsigned OpSet = X86::getSETFromCond(CC);
+
+ unsigned LHS = I.getOperand(2).getReg();
+ unsigned RHS = I.getOperand(3).getReg();
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LHS);
+
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 8:
+ OpCmp = X86::CMP8rr;
+ break;
+ case 16:
+ OpCmp = X86::CMP16rr;
+ break;
+ case 32:
+ OpCmp = X86::CMP32rr;
+ break;
+ case 64:
+ OpCmp = X86::CMP64rr;
+ break;
+ }
+
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LHS)
+ .addReg(RHS);
+
+ MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(OpSet), I.getOperand(0).getReg());
+
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 2a40399ba571..bc73bb1ae8c5 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4f5e70414aa9..cf26238c0239 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ZEXT, s32}, Legal);
setAction({G_SEXT, s32}, Legal);
- for (auto Ty : {s8, s16}) {
+ for (auto Ty : {s1, s8, s16}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfo64bit() {
@@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({G_SEXT, Ty}, Legal);
}
- for (auto Ty : {s8, s16, s32}) {
+ for (auto Ty : {s1, s8, s16, s32}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE1() {
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index cf2ceef8013a..7e4cba1c8345 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::X86_RegCall:
if (Is64Bit) {
if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
CSR_Win64_RegCall_NoSSE_SaveList);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
CSR_SysV64_RegCall_NoSSE_SaveList);
}
} else {
- return (HasSSE ? CSR_32_RegCall_SaveList :
+ return (HasSSE ? CSR_32_RegCall_SaveList :
CSR_32_RegCall_NoSSE_SaveList);
}
case CallingConv::Cold:
@@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
if (Is64Bit) {
- if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
CSR_Win64_RegCall_NoSSE_RegMask);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
CSR_SysV64_RegCall_NoSSE_RegMask);
}
} else {
- return (HasSSE ? CSR_32_RegCall_RegMask :
+ return (HasSSE ? CSR_32_RegCall_RegMask :
CSR_32_RegCall_NoSSE_RegMask);
}
case CallingConv::Cold:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..02be95e2e556 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// True if the LEA instruction has all three source operands: base, index,
+  /// and offset, or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13.
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -490,6 +495,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 086f55dd60b5..c6a90725d89c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
@@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSFreeBSD())
return llvm::make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl())
+ if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSFuchsia())
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3b619a2956a..80e18161a94b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
- { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
- { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
- { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
-
- { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
- return LT.first * 30;
+ return LT.first * 32;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
+ // XOP has faster vXi8 shifts.
+ if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+ !ST->hasXOP())
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
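
A worked reading of the "+2" entries, under the convention the table comments use: a 256-bit op on a 128-bit-wide subtarget costs the two half-ops plus a split/merge overhead of 2, and the result is still scaled by LT.first for any further legalization splits. For example, v32i8 SHL on SSE2 (illustrative arithmetic only):

#include <cassert>

int main() {
  const int HalfOpCost = 2;  // psllw + pand for one v16i8 half
  const int SplitCost = 2;   // extract/insert overhead for the 256-bit split
  const int Entry = 2 * HalfOpCost + SplitCost; // table value for v32i8 SHL
  assert(Entry == 6);        // total reported cost = LT.first * 6
  return 0;
}
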
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
- { ISD::SHL, MVT::v32i8, 2 },
- { ISD::SRL, MVT::v32i8, 4 },
- { ISD::SRA, MVT::v32i8, 4 },
- { ISD::SHL, MVT::v16i16, 2 },
- { ISD::SRL, MVT::v16i16, 4 },
- { ISD::SRA, MVT::v16i16, 4 },
- { ISD::SHL, MVT::v8i32, 2 },
- { ISD::SRL, MVT::v8i32, 4 },
- { ISD::SRA, MVT::v8i32, 4 },
- { ISD::SHL, MVT::v4i64, 2 },
- { ISD::SRL, MVT::v4i64, 4 },
- { ISD::SRA, MVT::v4i64, 4 },
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v4i64, 2 }, // psllq.
-
- { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
-
- { ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v8i32, 2 }, // psrad.
- { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
- { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+    // Handle AVX2 uniform v4i64 ISD::SRA; it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
@@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
-
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
if (ST->hasSSE41())
@@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i16, 1 }, // pmullw
- { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
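
The 2*N+2 entries in the tables above follow a convention worth spelling out: a 256-bit type that must be split pays twice the 128-bit cost plus a fixed overhead of 2 for extracting and reinserting the halves. A minimal standalone sketch of that arithmetic (the helper name and the overhead constant are illustrative, taken from this table's convention):

static int splitCost(int HalfCost, int SplitOverhead = 2) {
  // cost(256-bit op) = 2 * cost(128-bit op) + split overhead.
  return 2 * HalfCost + SplitOverhead;
}

int main() {
  // v16i16 SRA: two v8i16 pblendvb sequences (14 each) plus the split
  // overhead of 2 gives the 2*14+2 = 30 entry above.
  return splitCost(14) == 30 ? 0 : 1;
}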
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 500b26b3be17..3ee14a0ff7b1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
/*isVarArg=*/false);
Function *Trampoline =
Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
- Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
ParentFunc->getName()),
TheModule);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index b8742683a0c8..1da189c5cd31 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
{
KnownBits Known;
DAG.computeKnownBits(Value, Known);
- return Known.Zero.countTrailingOnes() >= 2;
+ return Known.countMinTrailingZeros() >= 2;
}
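
For readers unfamiliar with the KnownBits API this hunk switches to: countMinTrailingZeros() returns the number of low bits that are provably zero, so a result of 2 or more proves 4-byte alignment. A minimal standalone sketch with a toy stand-in type (not the real llvm::KnownBits):

#include <cstdint>

// Toy stand-in for llvm::KnownBits over a 32-bit value: Zero has a bit set
// where the value is known to be 0, One where it is known to be 1.
struct ToyKnownBits {
  uint32_t Zero = 0, One = 0;
  unsigned countMinTrailingZeros() const {
    // Count the run of low bits proven zero.
    unsigned N = 0;
    while (N < 32 && ((Zero >> N) & 1))
      ++N;
    return N;
  }
};

int main() {
  // An address with its two low bits proven clear is 4-byte aligned.
  ToyKnownBits Known;
  Known.Zero = 0x3; // bits 0 and 1 known zero
  return Known.countMinTrailingZeros() >= 2 ? 0 : 1;
}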
SDValue XCoreTargetLowering::
@@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
unsigned NumBytes = RetCCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index f1d52d5a191f..b87ba6548962 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
[SDNPHasChain, SDNPMayLoad]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+ SDTCisVT<1, i32> ]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -323,9 +324,9 @@ class F2R_np<bits<6> opc, string OpcStr> :
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP] in {
-def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
- "# ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ "# ADJCALLSTACKDOWN $amt, $amt2",
+ [(callseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"# ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt
new file mode 100644
index 000000000000..ad458450fda3
--- /dev/null
+++ b/lib/ToolDrivers/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(llvm-lib)
diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt
new file mode 100644
index 000000000000..7da9a5c01005
--- /dev/null
+++ b/lib/ToolDrivers/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/ToolDrivers/LLVMBuild.txt --------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = llvm-lib
+
+[component_0]
+type = Group
+name = ToolDrivers
+parent = Libraries
diff --git a/lib/LibDriver/CMakeLists.txt b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
index ab53a6843446..ab53a6843446 100644
--- a/lib/LibDriver/CMakeLists.txt
+++ b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
diff --git a/lib/LibDriver/LLVMBuild.txt b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
index 799dc997c0bb..799dc997c0bb 100644
--- a/lib/LibDriver/LLVMBuild.txt
+++ b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c50629d71501..3bae3826d62e 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LibDriver/LibDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Option/Arg.h"
diff --git a/lib/LibDriver/Options.td b/lib/ToolDrivers/llvm-lib/Options.td
index 5a56ef7468d4..5a56ef7468d4 100644
--- a/lib/LibDriver/Options.td
+++ b/lib/ToolDrivers/llvm-lib/Options.td
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74..4480220f2cd4 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
// consume. Note that crossing coro.save also requires a spill, as any code
// between coro.save and coro.suspend may resume the coroutine and all of the
// state needs to be saved by that time.
- auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
BasicBlock *SuspendBlock = BarrierInst->getParent();
auto &B = getBlockData(SuspendBlock);
B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
return FramePtr;
}
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode and it is the last
+ // PHI Node. So, if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+    // happens because the BB list of PHI nodes is usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case it
+// does EH-specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
static void rewritePHIs(BasicBlock &BB) {
// For every incoming edge we will create a block holding all
// incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
// loop:
// %n.val = phi i32[%n, %entry], [%inc, %loop]
//
- // It will create:
+ // It will create:
//
// loop.from.entry:
// %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
// TODO: Simplify PHINodes in the basic block to remove duplicate
// predecessors.
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
for (BasicBlock *Pred : Preds) {
- auto *IncomingBB = SplitEdge(Pred, &BB);
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
auto *PN = cast<PHINode>(&BB.front());
do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
InputV->addIncoming(V, Pred);
PN->setIncomingValue(Index, InputV);
PN = dyn_cast<PHINode>(PN->getNextNode());
- } while (PN);
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+    // Calls to ehAwareSplitEdge cloned the original landing pad; it is
+    // no longer needed.
+ LandingPad->eraseFromParent();
}
}
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
return true;
// Lookup the linkage recorded in the summaries during global analysis.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- GlobalValue::LinkageTypes Linkage;
+ auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
// Must have been promoted (possibly conservatively). Find original
// name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage,
TheModule.getSourceFileName());
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
if (GS == DefinedGlobals.end()) {
// Also check the original non-promoted non-globalized name. In some
// cases a preempted weak value is linked in as a local copy because
@@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// In that case, since it was originally not a local value, it was
// recorded in the index using the original name.
// FIXME: This may not be needed once PR27866 is fixed.
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
assert(GS != DefinedGlobals.end());
- Linkage = GS->second->linkage();
- } else {
- Linkage = GS->second->linkage();
}
- } else
- Linkage = GS->second->linkage();
- return !GlobalValue::isLocalLinkage(Linkage);
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
};
// FIXME: See if we can just internalize directly here via linkage changes
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
- /*cg=*/nullptr, &GetAssumptionCache,
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
static cl::opt<bool>
DisablePartialInlining("disable-partial-inlining", cl::init(false),
cl::Hidden, cl::desc("Disable partial inlining"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
static cl::opt<unsigned> MaxNumInlineBlocks(
"max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
"max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
cl::desc("Max number of partial inlining. The default is unlimited"));
+// Used only when PGO or user-annotated branch data is absent. It is the
+// minimum relative frequency used to weight the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
namespace {
struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
- std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutliningBB relative to F's entry point.
+  // The result is no larger than 1 and is represented as a BranchProbability.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+ FunctionOutliningInfo *OI,
+ Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI,
+ BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined, i.e. if
+  // partial inlining is expected to be profitable.
+ bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI,
+ BasicBlock *OutliningCallBB,
+ int OutliningCallOverhead,
+ OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to the
+  // OutlinedFunction) into its callers. Return true if any inlining
+  // succeeded.
+ bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /* original function */
+ FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from each use site of DuplicateFunction to the
+  // enclosing BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
bool IsLimitReached() {
return (MaxNumPartialInlining != -1 &&
NumPartialInlining >= MaxNumPartialInlining);
}
+
+ CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+ // setup cost in the outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block 'OutliningCallBB';
+ // - The third value is the estimated size of the original code from
+ // function 'F' that is extracted into the outlined function.
+ std::tuple<int, int, int>
+ computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+ Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and the runtime cost (note that the current
+  // inline cost analysis does not clearly distinguish between the two either).
+ int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return isa<ReturnInst>(TI);
};
- auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsReturnBlock(Succ1))
return std::make_tuple(Succ1, Succ2);
if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
};
// Detect a triangular shape:
- auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsSuccessor(Succ1, Succ2))
return std::make_tuple(Succ1, Succ2);
if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// Do a sanity check of the entries: there should not
// be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}.
- assert(OutliningInfo->Entries[0] == &F->front());
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
DenseSet<BasicBlock *> Entries;
for (BasicBlock *E : OutliningInfo->Entries)
Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return OutliningInfo;
}
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
- OptimizationRemarkEmitter &ORE) {
- // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user-annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+ Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+ auto EntryFreq =
+ BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+ auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(F, OI))
+ return OutlineRegionRelFreq;
+
+  // When profile data is not available, we need to be very conservative in
+  // estimating the overall savings, so make sure the outline region's
+  // relative frequency is not below the threshold specified by the option.
+  OutlineRegionRelFreq = std::max(
+      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
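+
A worked example of the clamp above, with assumed frequencies and plain integer arithmetic in place of BlockFrequency/BranchProbability:

#include <algorithm>
#include <cstdint>

int main() {
  // Assume BFI reports entry frequency 1000 and outlining-call frequency
  // 200, i.e. a 20% relative frequency.
  uint64_t EntryFreq = 1000, OutliningCallFreq = 200;
  unsigned RelFreqPercent =
      static_cast<unsigned>(OutliningCallFreq * 100 / EntryFreq); // 20

  // Without profile data, clamp up to the -outline-region-freq-percent
  // threshold (default 75) to stay conservative about the savings.
  unsigned Clamped = std::max(RelFreqPercent, 75u);
  return Clamped == 75 ? 0 : 1;
}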
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+ int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", Callee)
+ << NV("Callee", F)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
}
if (!IC) {
- ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ auto RelativeFreq =
+ getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+ auto NormWeightedRcost =
+ BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+ // Weighted saving is smaller than weighted cost, return false
+ if (NormWeightedSavings < NormWeightedRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", F) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", Callee) << " can be partially inlined into "
+ << NV("Callee", F) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
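
To make the final comparison concrete, a small sketch with assumed numbers (toy arithmetic; the real code uses BlockFrequency scaled by a BranchProbability):

int main() {
  // Assumed inputs: eliminating the call saves 25 units; the outlined-call
  // sequence plus outlined-function setup costs 40 units, but the outlining
  // call block runs only 50% as often as the entry block.
  int NonWeightedSavings = 25;
  int NonWeightedRcost = 40;
  int WeightedRcost = NonWeightedRcost * 50 / 100; // 20

  // 25 >= 20, so partial inlining is considered profitable here.
  return NonWeightedSavings >= WeightedRcost ? 0 : 1;
}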
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
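+
A worked example of this block-cost model, assuming InlineConstants::InstrCost is 5 (the constant is hard-coded here only for illustration):

int main() {
  const int InstrCost = 5; // assumed value of InlineConstants::InstrCost

  // A block with one debug intrinsic (free), one switch with 3 cases,
  // and two ordinary instructions:
  int Cost = 0;
  Cost += 0;                   // dbg intrinsic: skipped
  Cost += (3 + 1) * InstrCost; // switch: (NumCases + 1) * InstrCost = 20
  Cost += 2 * InstrCost;       // two plain instructions = 10
  return Cost == 30 ? 0 : 1;
}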
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+ Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB) {
+ // First compute the cost of the outlined region 'OI' in the original
+ // function 'F':
+ int OutlinedRegionCost = 0;
+ for (BasicBlock &BB : *F) {
+ if (&BB != OI->ReturnBlock &&
+ // Assuming Entry set is small -- do a linear search here:
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+ OI->Entries.end()) {
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+ }
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *OutlinedFunction) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+ OutlinedRegionCost);
+}
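+
The overhead formula above, with assumed costs plugged in:

int main() {
  // Assume: calling the outlined function costs 10, the outlined function
  // body costs 50, and the region it replaced cost 40 in the original.
  int OutliningFuncCallCost = 10;
  int OutlinedFunctionCost = 50;
  int OutlinedRegionCost = 40;

  // Overhead = call sequence + extra setup the extraction introduced.
  int OutliningRuntimeOverhead =
      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
  return OutliningRuntimeOverhead == 20 ? 0 : 1;
}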
+
+// Create the callsite-to-profile-count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ }
+}
+
Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (PSI->isFunctionEntryCold(F))
return nullptr;
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- computeOutliningInfo(F);
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OutliningInfo)
+ if (!OI)
return nullptr;
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
- BasicBlock *NewNonReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
DenseSet<BasicBlock *> NewEntries;
- for (BasicBlock *BB : OutliningInfo->Entries) {
+ for (BasicBlock *BB : OI->Entries) {
NewEntries.insert(cast<BasicBlock>(VMap[BB]));
}
@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BasicBlock *PreReturn = NewReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
Ins = NewReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
OldPhi->removeIncomingValue(NewE);
}
++I;
}
- for (auto E : OutliningInfo->ReturnBlockPreds) {
+ for (auto E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
}
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
// Returns true if the block is to be partially inlined into the caller
// (i.e. not extracted to the out-of-line function).
- auto ToBeInlined = [=](BasicBlock *BB) {
+ auto ToBeInlined = [&](BasicBlock *BB) {
return BB == NewReturnBlock || NewEntries.count(BB);
};
// Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function *ExtractedFunction =
+ Function *OutlinedFunction =
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
.extractCodeRegion();
- // Inline the top-level if test into all callers.
+ bool AnyInline =
+ tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ // Remove the function that is speculatively created:
+ if (OutlinedFunction)
+ OutlinedFunction->eraseFromParent();
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+ Function *F,
+ FunctionOutliningInfo *OI,
+ Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI) {
+ if (OutlinedFunction == nullptr)
+ return false;
+
+ int NonWeightedRcost;
+ int SizeCost;
+ int OutlinedRegionSizeCost;
+
+ auto OutliningCallBB =
+ getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+ std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+ computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the
+  // original outlined region, outlining does not increase the chances of
+  // inlining 'F' (the inliner uses the size increase to model the cost of
+  // inlining a callee).
+ if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(F);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", F)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(F->user_begin() == F->user_end() &&
+ "F's users should all be replaced!");
std::vector<User *> Users(DuplicateFunction->user_begin(),
DuplicateFunction->user_end());
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (F->getEntryCount())
+ computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = F->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ bool AnyInline = false;
for (User *User : Users) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(User))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
+ CallSite CS = getCallSite(User);
if (IsLimitReached())
continue;
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, ORE))
+
+ if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+ NonWeightedRcost, ORE))
continue;
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
- << ore::NV("Callee", F) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller()));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
NumPartialInlining++;
- // update stats
+ // Update the stats
NumPartialInlined++;
}
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
-
+ if (AnyInline && CalleeEntryCount)
+ F->setEntryCount(CalleeEntryCountV);
- return ExtractedFunction;
+ return AnyInline;
}
bool PartialInlinerImpl::run(Module &M) {
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
else
GO = new GlobalVariable(
*M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ nullptr, "", nullptr,
GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
GO->takeName(GA);
GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(
// FIXME: Try to re-use BSI and PFI from the original module here.
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
SmallVector<char, 0> Buffer;
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown,
- const KnownBits &RHSKnown) {
- // Addition of two 2's complement numbers having opposite signs will never
- // overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
- return true;
-
- // If either of the values is known to be non-negative, adding them can only
- // overflow if the second is also non-negative, so we can assume that.
- // Two non-negative numbers will only overflow if there is a carry to the
- // sign bit, so we can check if even when the values are as big as possible
- // there is no overflow to the sign bit.
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
- APInt MaxLHS = ~LHSKnown.Zero;
- MaxLHS.clearSignBit();
- APInt MaxRHS = ~RHSKnown.Zero;
- MaxRHS.clearSignBit();
- APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
- return Result.isSignBitClear();
- }
-
- // If either of the values is known to be negative, adding them can only
- // overflow if the second is also negative, so we can assume that.
- // Two negative number will only overflow if there is no carry to the sign
- // bit, so we can check if even when the values are as small as possible
- // there is overflow to the sign bit.
- if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
- APInt MinLHS = LHSKnown.One;
- MinLHS.clearSignBit();
- APInt MinRHS = RHSKnown.One;
- MinRHS.clearSignBit();
- APInt Result = std::move(MinLHS) + std::move(MinRHS);
- return Result.isSignBitSet();
- }
-
- // If we reached here it means that we know nothing about the sign bits.
- // In this case we can't know if there will be an overflow, since by
- // changing the sign bits any two values can be made to overflow.
- return false;
-}
-
-/// Return true if we can prove that:
-/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction &CxtI) {
- // There are different heuristics we can use for this. Here are some simple
- // ones.
-
- // If LHS and RHS each have at least two sign bits, the addition will look
- // like
- //
- // XX..... +
- // YY.....
- //
- // If the carry into the most significant position is 0, X and Y can't both
- // be 1 and therefore the carry out of the addition is also 0.
- //
- // If the carry into the most significant position is 1, X and Y can't both
- // be 0 and therefore the carry out of the addition is also 1.
- //
- // Since the carry into the most significant position is always equal to
- // the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- KnownBits LHSKnown(BitWidth);
- computeKnownBits(LHS, LHSKnown, 0, &CxtI);
-
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(RHS, RHSKnown, 0, &CxtI);
-
- // Check if carry bit of addition will not cause overflow.
- if (checkRippleForAdd(LHSKnown, RHSKnown))
- return true;
-
- return false;
-}
-
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nsw LHS, RHS)
/// This basically requires proving that the add in the original type would not
@@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
Instruction &CxtI) {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
- &CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
- &CxtI);
- if (LHSKnownNegative && RHSKnownNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
return true;
return false;
@@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
return nullptr;
}
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+ // FIXME: This should check hasOneUse to not increase the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)),
+ m_APInt(C3)))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+
+ return nullptr;
+}
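+
The sign-mask folds at the top of this helper rest on two's-complement identities that can be checked exhaustively at a narrow width. A standalone i8 verification sketch (independent of InstCombine; the width is an assumption for the test):

#include <cstdint>

int main() {
  const uint8_t SignMask = 0x80;
  for (unsigned V = 0; V < 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    // With wrapping allowed, adding the sign mask just flips the sign bit.
    if (static_cast<uint8_t>(X + SignMask) != (X ^ SignMask))
      return 1;
    // With nuw (X + 0x80 must not wrap, so X < 0x80), the sign bit was
    // clear and the add simply sets it, which is an 'or'.
    if (X < SignMask &&
        static_cast<uint8_t>(X + SignMask) != (X | SignMask))
      return 1;
  }
  return 0;
}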
+
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *RHSC;
- if (match(RHS, m_APInt(RHSC))) {
- if (RHSC->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(LHS, RHS);
- }
-
- // Is this add the last step in a convoluted sext?
- Value *X;
- const APInt *C;
- if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
- C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- return CastInst::Create(Instruction::SExt, X, LHS->getType());
- }
-
- if (RHSC->isNegative() &&
- match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
- // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- Constant *NewC =
- ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
- return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
- }
- }
+ if (Instruction *X = foldAddWithConstant(I, *Builder))
+ return X;
- // FIXME: Use the match above instead of dyn_cast to allow these transforms
- // for splat vectors.
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
@@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNUWAdd(
LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
@@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// TODO(jingyue): Consider WillNotOverflowSignedAdd and
- // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b114801cc1c0..82dc88f1b3ad 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static inline Value *dyn_castNotVal(Value *V) {
- // If this is not(not(x)) don't return that this is a not: we want the two
- // not's to be folded first.
- if (BinaryOperator::isNot(V)) {
- Value *Operand = BinaryOperator::getNotArgument(V);
- if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
- return Operand;
- }
-
- // Constants can be considered to be not'ed values...
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantInt::get(C->getType(), ~C->getValue());
- return nullptr;
-}
-
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
/// a four bit mask.
static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
}
// This simplification is only valid if the upper range is not negative.
- bool IsNegative, IsNotNegative;
- ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
- if (!IsNotNegative)
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
return nullptr;
if (Inverted)
@@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
auto Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
+
// Flip the logic operation.
- if (Opcode == Instruction::And)
- Opcode = Instruction::Or;
- else
- Opcode = Instruction::And;
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- // TODO: Use pattern matchers instead of dyn_cast.
- if (Value *Op0NotVal = dyn_castNotVal(Op0))
- if (Value *Op1NotVal = dyn_castNotVal(Op1))
- if (Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
- I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(LogicOp);
- }
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
return nullptr;
}
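
A quick exhaustive check of the De Morgan identities this matcher implements, at an assumed i8 width (standalone verification, not part of the pass):

#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t X = static_cast<uint8_t>(A), Y = static_cast<uint8_t>(B);
      // (~X & ~Y) == ~(X | Y)  and  (~X | ~Y) == ~(X & Y)
      if (static_cast<uint8_t>(~X & ~Y) != static_cast<uint8_t>(~(X | Y)))
        return 1;
      if (static_cast<uint8_t>(~X | ~Y) != static_cast<uint8_t>(~(X & Y)))
        return 1;
    }
  return 0;
}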
@@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
{
@@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I))
return replaceInstUsesWith(I, V);
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateXor(Or,
- Builder->getInt(C1->getValue() & ~RHS->getValue()));
- }
- }
-
if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
@@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
// Canonicalize xor to the RHS.
@@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
// Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotOp;
- if (match(&I, m_Not(m_BinOp(NotOp)))) {
- if (NotOp->getOpcode() == Instruction::And ||
- NotOp->getOpcode() == Instruction::Or) {
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
// Apply DeMorgan's Law when inverts are free:
// ~(X & Y) --> (~X | ~Y)
// ~(X | Y) --> (~X & ~Y)
- if (IsFreeToInvert(NotOp->getOperand(0),
- NotOp->getOperand(0)->hasOneUse()) &&
- IsFreeToInvert(NotOp->getOperand(1),
- NotOp->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs");
- Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs");
- if (NotOp->getOpcode() == Instruction::And)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
return BinaryOperator::CreateOr(NotX, NotY);
return BinaryOperator::CreateAnd(NotX, NotY);
}
- } else if (NotOp->getOpcode() == Instruction::AShr) {
- // ~(~X >>s Y) --> (X >>s Y)
- if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0)))
- return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1));
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
}
}
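
The two shift folds above depend on the shifted-in bits being exactly inverted when the constant's sign bit is inverted. A standalone i8 check under the stated sign preconditions (assuming arithmetic right shift for signed types, as on all mainstream targets):

#include <cstdint>

int main() {
  for (unsigned V = 0; V < 256; ++V)
    for (unsigned Y = 0; Y < 8; ++Y) {
      uint8_t C = static_cast<uint8_t>(V);
      uint8_t AShr = static_cast<uint8_t>(static_cast<int8_t>(C) >> Y);
      uint8_t LShr = static_cast<uint8_t>(C >> Y);
      uint8_t NotC = static_cast<uint8_t>(~C);
      if (C & 0x80) {
        // C negative: ~(C >>s Y) == ~C >>u Y
        if (static_cast<uint8_t>(~AShr) != static_cast<uint8_t>(NotC >> Y))
          return 1;
      } else {
        // C non-negative: ~(C >>u Y) == ~C >>s Y
        if (static_cast<uint8_t>(~LShr) !=
            static_cast<uint8_t>(static_cast<int8_t>(NotC) >> Y))
          return 1;
      }
    }
  return 0;
}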
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6989d67f0060..face7abcc95f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros()
- : Known.One.countLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes()
- : Known.Zero.countLeadingOnes();
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
// If all bits above (ctlz) or below (cttz) the first known one are known
// zero, this value is constant.
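
A small example of what the renamed accessors compute, using a toy 8-bit stand-in for KnownBits (the real class lives in llvm/Support/KnownBits.h):

#include <cstdint>

int main() {
  // Known.One = 0b00001000, Known.Zero = 0b00000111: bit 3 is known one and
  // all bits below it are known zero.
  uint8_t One = 0x08, Zero = 0x07;

  unsigned MinTZ = 0; // trailing bits proven zero
  while (MinTZ < 8 && ((Zero >> MinTZ) & 1))
    ++MinTZ;

  unsigned MaxTZ = 0; // trailing bits not proven one
  while (MaxTZ < 8 && !((One >> MaxTZ) & 1))
    ++MaxTZ;

  // MinTZ == MaxTZ == 3, so cttz of this value folds to the constant 3.
  return (MinTZ == 3 && MaxTZ == 3) ? 0 : 1;
}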
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 312d9baae43a..001a4bcf16f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
+  // FIXME: Maybe combine the next two transforms to handle the no-cast case
+  // more efficiently. Support vector types. Clean up the code by using
+  // m_OneUse.
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = nullptr; ConstantInt *Cst = nullptr;
if (Src->hasOneUse() &&
@@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// the sign bit of the original value; performing ashr instead of lshr
// generates bits of the same value as the sign bit.
if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
- cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ unsigned ShiftAmt = Cst->getZExtValue();
// This optimization can be only performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values smaller than the size of destination type (in
- // bits).
- if (Cst->getValue().ult(ASize)) {
- Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
+ // If shifting by the size of the original value in bits or more, it is
+    // being filled with the sign bit, so shift by ASize-1 to avoid UB.
+ Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
}
@@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// If we know that the value being extended is positive, we can use a zext
// instead.
- bool KnownZero, KnownOne;
- ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
- if (KnownZero) {
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
Value *ZExt = Builder->CreateZExt(Src, DestTy);
return replaceInstUsesWith(CI, ZExt);
}
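
A brute-force check of the trunc(lshr(sext A), C) rewrite above, for i8 sign-extended to i32 (plain C++, not part of the patch): as long as C stays within the 24 extension bits, truncating the wide logical shift equals an arithmetic shift on A with the amount clamped to 7.

#include <cassert>
#include <cstdint>

int main() {
  for (int A = -128; A <= 127; ++A) {
    int32_t Ext = (int8_t)A;                 // sext i8 -> i32
    for (unsigned C = 0; C <= 24; ++C) {     // C <= SExtSize - ASize
      int8_t ViaWide = (int8_t)((uint32_t)Ext >> C);  // trunc(lshr(sext A), C)
      int8_t ViaNarrow =
          (int8_t)((int8_t)A >> (C < 7 ? C : 7));     // ashr A, min(C, 7)
      assert(ViaWide == ViaNarrow);
    }
  }
}
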
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 34ce235b3fe2..60ed4057cedd 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (!BO0 && !BO1)
return nullptr;
- CmpInst::Predicate Pred = I.getPredicate();
+ const CmpInst::Predicate Pred = I.getPredicate();
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
@@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
case Instruction::Sub:
case Instruction::Xor:
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
// icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
if (CI->getValue().isSignMask()) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
@@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
AP.getBitWidth() - AP.countTrailingZeros()));
Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
+ return new ICmpInst(Pred, And1, And2);
}
}
break;
+
case Instruction::UDiv:
case Instruction::LShr:
- if (I.isSigned())
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
- LLVM_FALLTHROUGH;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
if (!NSW && I.isSigned())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
@@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
auto BitwiseAnd =
m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
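
The exact-shift case split above rests on a simple fact: when no set bits are discarded ('exact'), unsigned comparisons of the shifted values agree with comparisons of the originals. A spot check in plain C++ (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Multiples of 8, so "lshr exact x, 3" really is exact.
  for (uint32_t X = 0; X < 1024; X += 8)
    for (uint32_t Y = 0; Y < 1024; Y += 8) {
      assert(((X >> 3) <  (Y >> 3)) == (X <  Y)); // unsigned order preserved
      assert(((X >> 3) == (Y >> 3)) == (X == Y)); // equality preserved
    }
}
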
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3be6419a129a..1424f61fe701 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -388,10 +389,21 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
+  bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
   bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -492,7 +504,11 @@ public:
void computeKnownBits(Value *V, KnownBits &Known,
unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(Value *V, unsigned Depth,
+ Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
}
bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
@@ -503,11 +519,6 @@ public:
Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
- }
OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
@@ -516,6 +527,11 @@ public:
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
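
The new willNotOverflow* wrappers just collapse an OverflowResult into a bool. A toy version of the pattern (the real analysis lives in ValueTracking; this sketch uses the classic sufficient condition that both operands have their top bit clear):

#include <cassert>
#include <cstdint>

enum class OverflowResult { NeverOverflows, MayOverflow };

OverflowResult computeOverflowForUnsignedAddSketch(uint32_t A, uint32_t B) {
  // If the high bit of both operands is known zero, A + B < 2^32.
  if (!(A >> 31) && !(B >> 31))
    return OverflowResult::NeverOverflows;
  return OverflowResult::MayOverflow;
}

bool willNotOverflowUnsignedAddSketch(uint32_t A, uint32_t B) {
  return computeOverflowForUnsignedAddSketch(A, B) ==
         OverflowResult::NeverOverflows;
}

int main() {
  assert(willNotOverflowUnsignedAddSketch(0x7fffffffu, 0x7fffffffu));
  // 0x80000000 + 1 does not actually wrap, but the cheap check can only
  // answer "may overflow" here.
  assert(!willNotOverflowUnsignedAddSketch(0x80000000u, 1));
}
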
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 675553017838..a4d84ae81aa0 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// first non-zero index.
auto IsAllNonNegative = [&]() {
for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- bool KnownNonNegative, KnownNegative;
- IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
- KnownNegative, 0, MemI);
- if (KnownNonNegative)
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
continue;
return false;
}
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index face9d9237ae..2a35259f2103 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// product is exactly the minimum negative number.
// E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
// For simplicity we just check if at least one side is not negative.
- bool LHSNonNegative, LHSNegative;
- bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
- if (LHSNonNegative || RHSNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
return true;
}
return false;
@@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
@@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
- Op1Conv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
Value *NewMul = Builder->CreateNUWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
@@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedMul(Op0, Op1, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
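
The corner case the sign-bit check in WillNotOverflowSignedMul guards against, reproduced in plain C++ (assuming the usual two's-complement narrowing; not part of the patch): two i16 operands with 17 sign bits between them whose product is exactly the minimum negative number.

#include <cassert>
#include <cstdint>

int main() {
  int16_t A = (int16_t)0xff00; // -256, 8 sign bits
  int16_t B = (int16_t)0xff80; // -128, 9 sign bits
  int32_t Wide = (int32_t)A * (int32_t)B;
  assert(Wide == 0x8000);              // 0xff00 * 0xff80 = 0x8000
  assert((int16_t)Wide == INT16_MIN);  // the i16 multiply overflows
  // If at least one operand is known non-negative this cannot happen,
  // which is exactly the cheap extra check the code performs.
}
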
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 05b01774cd5e..4028a92771a4 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
- unsigned Leaders = Known2.Zero.countLeadingOnes();
+ unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1792cb585f87..65b1148cb03b 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
// Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
+ // TODO: Why are we only transforming these 3 predicates?
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
FPred == FCmpInst::FCMP_OGE) {
FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
@@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
// Canonicalize icmp_ne -> icmp_eq
ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
IPred == ICmpInst::ICMP_SGE) {
@@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
return nullptr;
}
@@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(Cond, Known, 0, &SI);
- unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes();
- unsigned LeadingKnownOnes = Known.One.countLeadingOnes();
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
@@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
- bool DbgDeclaresChanged = LowerDbgDeclare(F);
+ bool MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
int Iteration = 0;
@@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
- Changed |= IC.run();
- if (!Changed)
+ if (!IC.run())
break;
}
- return DbgDeclaresChanged || Iteration > 1;
+ return MadeIRChange || Iteration > 1;
}
PreservedAnalyses InstCombinePass::run(Function &F,
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b034ccc46933..7eea44d6aca0 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -613,7 +613,15 @@ public:
bool UseGlobalsGC = true)
: ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
Recover(Recover || ClRecover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {}
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+      // and both suffer from gold PR19002, for which the UseGlobalsGC
+      // constructor argument is designed as a workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -656,6 +664,7 @@ private:
bool CompileKernel;
bool Recover;
bool UseGlobalsGC;
+ bool UseCtorComdat;
Type *IntptrTy;
LLVMContext *C;
Triple TargetTriple;
@@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
: GlobalVariable::PrivateLinkage;
GlobalVariable *Metadata = new GlobalVariable(
M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
+      Twine("__asan_global_") +
+          GlobalValue::dropLLVMManglingEscape(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
@@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, use a structure which binds the liveness of
// the global variable to the metadata struct. Keep the list of "Liveness" GV
// created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, we emit the global metadata in a way that
// allows the linker to properly strip dead globals.
- auto LivenessBinder = ConstantStruct::get(
- LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
GlobalVariable *Liveness = new GlobalVariable(
M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
Twine("__asan_binder_") + G->getName());
@@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy);
SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
@@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer =
- ConstantStruct::get(NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
@@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
// Put the constructor and destructor in comdat if both
// (1) global instrumentation is not TU-specific
// (2) target is ELF.
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
AsanCtorFunction);
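
The recurring StructType::get/ConstantStruct::get cleanups in this patch drop the old C-style nullptr sentinel in favor of a type-checked variadic signature. An illustrative sketch of the shape of that change (the Type and helper below are stand-ins, not LLVM's declarations):

#include <cassert>
#include <vector>

struct Type {};

// Variadic and type-checked: every argument must be a Type*, and no
// trailing nullptr is needed to mark the end of the list.
template <typename... Ts>
std::vector<Type *> structGetSketch(Ts *...Elements) {
  return {Elements...};
}

int main() {
  Type A, B;
  auto Elems = structGetSketch(&A, &B); // was: get(&A, &B, nullptr)
  assert(Elems.size() == 2);
}
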
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 8786781933ea..e2e3cbdbc295 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr);
+ RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
@@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ArgTLSTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ShadowTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 7dea1dee756a..e89384c559fe 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u64 *ArrayCounter;
// };
auto *StructInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
- Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
// This structure should be kept consistent with the CacheFragInfo struct
// in the runtime library.
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u32 NumStructs;
// StructInfo *Structs;
// };
- auto *CacheFragInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
unsigned NumStructs = 0;
@@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
getArrayCounterIdx(StructTy));
- Initializers.push_back(
- ConstantStruct::get(
- StructInfoTy,
- ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
- ConstantInt::get(Int32Ty,
- DL.getStructLayout(StructTy)->getSizeInBytes()),
- ConstantInt::get(Int32Ty, StructTy->getNumElements()),
- Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Offset, Int32PtrTy),
- Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Size, Int32PtrTy),
- TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
- ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- FieldCounterIdx),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- ArrayCounterIdx),
- nullptr));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
}
// Structs.
Constant *StructInfo;
@@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
auto *CacheFragInfoGV = new GlobalVariable(
M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
- ConstantStruct::get(CacheFragInfoTy,
- UnitName,
- ConstantInt::get(Int32Ty, NumStructs),
- StructInfo,
- nullptr));
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
return CacheFragInfoGV;
}
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 15333a5317dd..ff753c20a94a 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
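
Why the new wide-to-i1 path above compares against zero instead of truncating, restated in plain C++ (not part of the patch): a value is poisoned if any shadow bit is set, and a plain trunc would keep only bit 0.

#include <cassert>
#include <cstdint>

int main() {
  auto ShadowToI1 = [](uint32_t Shadow) { return Shadow != 0; }; // icmp ne 0
  assert(ShadowToI1(0x00000100));   // poisoned even though bit 0 is clear
  assert((0x00000100u & 1) == 0);   // a trunc to i1 would drop the poison
  assert(!ShadowToI1(0));           // a clean shadow stays clean
}
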
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3f1a77b49a44..ee493a8ec7e1 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
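
For an add of a constant C, the guaranteed no-unsigned-wrap region that makeGuaranteedNoWrapRegion describes is simply [0, UINT_MAX - C]; the pass then asks whether LVI's range for the other operand sits inside it. A boundary check of that simplified constant case (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 10;
  const uint32_t HiInclusive = UINT32_MAX - C;
  assert(HiInclusive + C == UINT32_MAX);       // last input that cannot wrap
  assert((uint32_t)(HiInclusive + 1 + C) < C); // one past it wraps to 0
}
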
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+///       1) \p CntInst is set to the instruction counting leading zeros (CTLZ),
+///          or nullptr if there is no such instruction.
+///       2) \p CntPhi is set to the corresponding phi node,
+///          or nullptr if there is no such node.
+///       3) \p Var is set to the value whose CTLZ could be used.
+///       4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = dyn_cast<PHINode>(VarX);
+ if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+ return false;
+
+  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
+  //       TODO: We can skip this step. If the loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently this is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+ if (!Phi || Phi->getParent() != LoopEntry)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ-based trip count).
+/// Returns true if CTLZ is inserted as the new trip count; otherwise false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+  // when X is 0. If we cannot guarantee X != 0, we need to check for this
+  // when expanding the intrinsic.
+ bool ZeroCheck = false;
+  // It is safe to assume the preheader exists, as it was checked in the
+  // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in the CTLZ intrinsic.
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we end up deleting the loop, i.e. its body is just these 6 instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+///        LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+  // If the CTLZ counter's initial value is not zero, insert an Add instruction.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
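
The loop shape this new idiom recognizer rewrites, and the closed form it installs, in plain C++ (not part of the patch; __builtin_clz is the GCC/Clang builtin standing in for the llvm.ctlz intrinsic, the shift is simplified to unsigned, and X must be non-zero, matching the idiom's precondition):

#include <cassert>
#include <cstdint>

// Count iterations of "do { cnt++; x >>= 1; } while (x != 0);".
static unsigned countByLoop(uint32_t X) {
  unsigned Cnt = 0;
  do {
    ++Cnt;
    X >>= 1; // x.next = x >> 1
  } while (X != 0);
  return Cnt;
}

int main() {
  for (uint32_t X : {1u, 2u, 3u, 0x80u, 0xdeadbeefu, 0xffffffffu}) {
    unsigned ClosedForm = 32 - __builtin_clz(X); // Count = BitWidth - CTLZ(InitX)
    assert(countByLoop(X) == ClosedForm);
  }
}
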
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 3c9850b156ac..5e0a705782ea 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -283,7 +283,6 @@ public:
// Forward propagation info
const Expression *getDefiningExpr() const { return DefiningExpr; }
- void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
// Value member set
bool empty() const { return Members.empty(); }
@@ -317,6 +316,9 @@ public:
--StoreCount;
}
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
// Return true if two congruence classes are equivalent to each other. This
// means
// that every field but the ID number and the dead field are equivalent.
@@ -401,9 +403,12 @@ class NewGVN {
MemorySSAWalker *MSSAWalker;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
- TarjanSCC SCCFinder;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
const SimplifyQuery SQ;
// Number of function arguments, used by ranking
@@ -430,11 +435,12 @@ class NewGVN {
// In order to correctly ensure propagation, we must keep track of what
// comparisons we used, so that when the values of the comparisons change, we
// propagate the information to the places we used the comparison.
-  DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
+  mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+      PredicateToUsers;
   // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has
   // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for
// stores, we no longer can rely solely on the def-use chains of MemorySSA.
- DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
@@ -457,7 +463,7 @@ class NewGVN {
DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
- DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
+ mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -511,21 +517,24 @@ public:
private:
// Expression handling.
- const Expression *createExpression(Instruction *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
- PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
- bool &AllConstant);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V);
- const UnknownExpression *createUnknownExpression(Instruction *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &AllConstant) const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *);
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *);
- const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
- const AggregateValueExpression *createAggregateValueExpression(Instruction *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -560,17 +569,18 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *) const;
const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *, MemoryAccess *);
- const Expression *performSymbolicLoadEvaluation(Instruction *);
- const Expression *performSymbolicStoreEvaluation(Instruction *);
- const Expression *performSymbolicCallEvaluation(Instruction *);
- const Expression *performSymbolicPHIEvaluation(Instruction *);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *);
- const Expression *performSymbolicCmpEvaluation(Instruction *);
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -620,8 +630,8 @@ private:
void markPredicateUsersTouched(Instruction *);
void markValueLeaderChangeTouched(CongruenceClass *CC);
void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void addPredicateUsers(const PredicateBase *, Instruction *);
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
// Main loop of value numbering
void iterateTouchedInstructions();
@@ -634,7 +644,7 @@ private:
void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E);
+ void deleteExpression(const Expression *E) const;
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -654,7 +664,7 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
- bool isCycleFree(const PHINode *PN);
+ bool isCycleFree(const PHINode *PN) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
@@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const {
// Delete a definitely dead expression, so it can be reused by the expression
// allocator. Some of these are not in creation functions, so we have to accept
// const versions.
-void NewGVN::deleteExpression(const Expression *E) {
+void NewGVN::deleteExpression(const Expression *E) const {
assert(isa<BasicExpression>(E));
auto *BE = cast<BasicExpression>(E);
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
@@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) {
}
PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
- bool &AllConstant) {
+ bool &AllConstant) const {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+  // We're sorting the values by pointer. In theory this might be a source of
+  // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
// Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
- return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
});
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- auto *BB = PN->getIncomingBlock(U);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
auto *DTN = DT->getNode(BB);
if (RPOOrdering.lookup(DTN) >= PHIRPO)
HasBackedge = true;
- AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+ AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);
// Don't try to transform self-defined phis.
- if (U == PN)
+ if (*U == PN)
return PN;
- return lookupOperandLeader(U);
+ return lookupOperandLeader(*U);
});
return E;
}
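
The predecessor-order sort above, reduced to its essentials in plain C++ (not part of the patch; an int id stands in for the BasicBlock pointer being compared): two PHIs with the same incoming (block, value) pairs end up with identical operand order, so they produce the same expression.

#include <algorithm>
#include <cassert>
#include <cstring>
#include <vector>

int main() {
  struct Incoming { int Block; const char *Value; };
  std::vector<Incoming> PhiA = {{2, "%y"}, {1, "%x"}};
  std::vector<Incoming> PhiB = {{1, "%x"}, {2, "%y"}}; // same pairs, swapped
  auto ByBlock = [](const Incoming &L, const Incoming &R) {
    return L.Block < R.Block;
  };
  std::sort(PhiA.begin(), PhiA.end(), ByBlock);
  std::sort(PhiB.begin(), PhiB.end(), ByBlock);
  for (size_t I = 0; I != PhiA.size(); ++I) {
    assert(PhiA[I].Block == PhiB[I].Block);
    assert(std::strcmp(PhiA[I].Value, PhiB[I].Value) == 0);
  }
}
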
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I) {
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
bool AllConstant = setBasicExpressionInfo(I, E);
@@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) {
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
@@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V) {
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- const MemoryAccess *MA) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
@@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const {
const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
auto *CC = getMemoryClass(MA);
assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a "
- "congruence class with a represenative memory "
- "access");
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
return CC->getMemoryLeader();
}
@@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
LoadInst *LI,
- const MemoryAccess *MA) {
+ const MemoryAccess *MA) const {
auto *E =
new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
@@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- const MemoryAccess *MA) {
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
auto *E = new (ExpressionAllocator)
StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
@@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
@@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
const Expression *
NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) {
+ MemoryAccess *DefiningAccess) const {
assert((!LI || LI->isSimple()) && "Not a simple load");
if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
return nullptr;
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
}
const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return nullptr;
@@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!");
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
}
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
// Instrinsics with the returned attribute are copies of arguments.
@@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
DEBUG(dbgs() << "Setting " << *From);
DEBUG(dbgs() << " equivalent to congruence class ");
DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader());
- DEBUG(dbgs() << "\n");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
NewClass->memory_insert(MP);
// This may have killed the class if it had no non-memory members
if (OldClass->getMemoryLeader() == From) {
- if (OldClass->memory_empty()) {
+ if (OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
@@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
// Determine if a phi is cycle-free. That means the values in the phi don't
// depend on any expressions that can change value as a result of the phi.
// For example, a non-cycle free phi would be v = phi(0, v+1).
-bool NewGVN::isCycleFree(const PHINode *PN) {
+bool NewGVN::isCycleFree(const PHINode *PN) const {
// In order to compute cycle-freeness, we do SCC finding on the phi, and see
// what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
// If it is not in a singleton, it is only cycle free if the other members are
@@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) {
}
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
@@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
return E;
}
-const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
return createAggregateValueExpression(I);
}
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
auto *CI = dyn_cast<CmpInst>(I);
// See if our operands are equal to those of a previous predicate, and if so,
// if it implies true or false.
@@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
}
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
}
// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
PredicateToUsers[PBranch->Condition].insert(I);
else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
@@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up too slow, we can maintain a next memory leader like we
// do for regular leaders.
// Make sure there will be a leader to find
- assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
- "Can't get next leader if there is none");
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
return MSSA->getMemoryAccess(NL);
@@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
setMemoryClass(InstMA, NewClass);
// Now, fixup the old class if necessary
if (OldClass->getMemoryLeader() == InstMA) {
- if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
DEBUG(dbgs() << "Memory class leader change for class "
<< OldClass->getID() << " to "
@@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
// If it's a store expression we are using, it means we are not equivalent
// to something earlier.
- if (isa<StoreExpression>(E)) {
- assert(lookupOperandLeader(SI->getValueOperand()) !=
- NewClass->getLeader());
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ assert(SE->getStoredValue() != NewClass->getLeader());
+ NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
DEBUG(dbgs() << "Changing leader of congruence class "
@@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
ExpressionToClass.erase(OldClass->getDefiningExpr());
}
@@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
NewClass->setLeader(SI);
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ NewClass->setStoredValue(SE->getStoredValue());
// The RepMemoryAccess field will be filled in properly by the
// moveValueToNewCongruenceClass call.
} else {
@@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const {
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
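
The hunks above fold the repeated compound check on store count and memory
members into a single CongruenceClass::definesNoMemory() query. The helper's
definition falls outside the context shown in this diff; inferred from its
call sites, it presumably reduces to the following sketch:

    // Hypothetical reconstruction of the helper used above; the real
    // definition lives with CongruenceClass, outside the hunks shown here.
    bool definesNoMemory() const {
      return getStoreCount() == 0 && memory_empty();
    }
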
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index fb1b47c48276..4f608c97147d 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
/// Update the dominator tree after removing one exiting predecessor of a loop
/// exit block.
static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
- DominatorTree &DT) {
+ DominatorTree &DT) {
assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
"Cannot have empty predecessors of the loop exit block if we split "
"off a block to unswitch!");
@@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
}
}
+/// Check that all the LCSSA PHI nodes in the loop exit block have
+/// loop-invariant incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but typically expect only one operand here.
+ for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this loop manually so that the new PHI
+ // gets one incoming entry per old edge: in some cases several switch cases
+ // reach the same unswitched block, and each such edge needs its own entry
+ // in the PHI.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
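
The backwards walk in rewritePHINodesForExitAndUnswitchedBlocks is
load-bearing: PHINode::removeIncomingValue compacts the operand list, so
iterating from the end keeps the remaining indices valid. A standalone model
of that indexing discipline (containers and types are stand-ins, not LLVM's):

    #include <utility>
    #include <vector>

    // Pairs model a PHI's (incoming-block, incoming-value) operands. Erasing
    // by index is only stable when walking from the back, because erase
    // shifts every later entry down by one, which is the same reason the
    // loop above visits the PHI operands in reverse.
    std::vector<int> extractIncomingFrom(std::vector<std::pair<int, int>> &Ops,
                                         int ExitingBB) {
      std::vector<int> Moved;
      for (int i = static_cast<int>(Ops.size()) - 1; i >= 0; --i) {
        if (Ops[i].first != ExitingBB)
          continue;
        Moved.push_back(Ops[i].second); // mirrors removeIncomingValue(i)
        Ops.erase(Ops.begin() + i);
      }
      return Moved;
    }
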
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
assert(L.contains(ContinueBB) &&
"Cannot have both successors exit and still be in the loop!");
- // If the loop exit block contains phi nodes, this isn't trivial.
- // FIXME: We should examine the PHI to determine whether or not we can handle
- // it trivially.
- if (isa<PHINode>(LoopExitBB->begin()))
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
@@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
BasicBlock *UnswitchedBB;
if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
(void)PredBB;
- assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!");
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- BasicBlock *ParentBB = BI.getParent();
-
// Now splice the branch to gate reaching the new preheader and re-point its
// successors.
OldPH->getInstList().splice(std::prev(OldPH->end()),
@@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// terminator.
BranchInst::Create(ContinueBB, ParentBB);
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
// Now we need to update the dominator tree.
updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
// But if we split something off of the loop exit block then we also removed
@@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!L.isLoopInvariant(LoopCond))
return false;
+ auto *ParentBB = SI.getParent();
+
// FIXME: We should compute this once at the start and update it!
SmallVector<BasicBlock *, 16> ExitBlocks;
L.getExitBlocks(ExitBlocks);
@@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin()))
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
if (ExitBlockSet.count(SI.getDefaultDest()) &&
- !isa<PHINode>(SI.getDefaultDest()->begin()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
@@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (CommonSuccBB) {
SI.setDefaultDest(CommonSuccBB);
} else {
- BasicBlock *ParentBB = SI.getParent();
BasicBlock *UnreachableBB = BasicBlock::Create(
ParentBB->getContext(),
Twine(ParentBB->getName()) + ".unreachable_default",
@@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// Now add the unswitched switch.
auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- // Split any exit blocks with remaining in-loop predecessors. We walk in
- // reverse so that we split in the same order as the cases appeared. This is
- // purely for convenience of reading the resulting IR, but it doesn't cost
- // anything really.
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on whether there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
// Handle the default exit if necessary.
// FIXME: It'd be great if we could merge this with the loop below but LLVM's
// ranges aren't quite powerful enough yet.
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- updateLoopExitIDom(DefaultExitBB, L, DT);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
}
// Note that we must use a reference in the for loop so that we update the
// container.
for (auto &CasePair : reverse(ExitCases)) {
// Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *&ExitBB = CasePair.second;
+ BasicBlock *ExitBB = CasePair.second;
// If this case is the last edge into the exit block, we can simply reuse it
// as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB))
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
continue;
+ }
// Otherwise we need to split the exit block so that we retain an exit
// block from the loop and a target for the unswitched condition.
@@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
updateLoopExitIDom(ExitBB, L, DT);
}
- ExitBB = SplitExitBB;
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
}
// Now add the unswitched cases. We do this in reverse order as we built them
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a0fc966cee2c..a7c308b59877 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
return false;
}
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
+
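
UINT_MAX doubles as a "never hoist" sentinel here: considerHoistingFromTo
below only accumulates the cost of instructions that pass the check and
leaves the rest in NotHoisted. A toy model of that contract (names are
hypothetical, not the pass's own):

    #include <climits>
    #include <utility>
    #include <vector>

    // Each pair is (whitelisted?, TTI cost). Non-whitelisted instructions
    // are priced at UINT_MAX and skipped by the accumulator, just as the
    // pass leaves them behind rather than hoisting them.
    unsigned speculationCost(bool Whitelisted, unsigned TTICost) {
      return Whitelisted ? TTICost : UINT_MAX;
    }

    unsigned totalSpeculationCost(
        const std::vector<std::pair<bool, unsigned>> &Insts) {
      unsigned Total = 0;
      for (const auto &I : Insts) {
        unsigned Cost = speculationCost(I.first, I.second);
        if (Cost != UINT_MAX)
          Total += Cost;
      }
      return Total;
    }
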
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallSet<const Instruction *, 8> NotHoisted;
@@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
unsigned TotalSpeculationCost = 0;
for (auto& I : FromBlock) {
- const unsigned Cost = TTI->getUserCost(&I);
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
AllPrecedingUsesFromBlockHoisted(&I)) {
TotalSpeculationCost += Cost;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7ffdad597a9b..83ec7f55d1af 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
computeKnownBits(V, Known, DL);
- if (Known.Zero.countLeadingOnes() >= HiBits)
+ if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
- if (Known.One.countLeadingZeros() < HiBits)
+ if (Known.countMaxLeadingZeros() < HiBits)
return VALRNG_LIKELY_LONG;
// Long integer divisions are often used in hashtable implementations. It's
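
This hunk, together with the Local.cpp and LoadStoreVectorizer.cpp hunks
further down, migrates from poking at the raw Zero/One APInts of KnownBits to
named accessors. The correspondence below is reconstructed purely from the
before/after pairs in this patch; it is a sketch, not the real header:

    #include "llvm/ADT/APInt.h"

    // A set bit in Zero/One means that bit is known to be 0/1 respectively.
    struct KnownBitsSketch {
      llvm::APInt Zero, One;
      // Bits guaranteed zero at the top: the leading run of set bits in Zero.
      unsigned countMinLeadingZeros() const { return Zero.countLeadingOnes(); }
      // Bits that could be zero at the top: leading bits not known one.
      unsigned countMaxLeadingZeros() const { return One.countLeadingZeros(); }
      // Bits guaranteed zero at the bottom: trailing run of set bits in Zero.
      unsigned countMinTrailingZeros() const { return Zero.countTrailingOnes(); }
      // Bits that could be one at the bottom: trailing bits not known zero.
      unsigned countMaxTrailingOnes() const { return Zero.countTrailingZeros(); }
    };
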
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index d5124ac89016..4aa26fd14fee 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
ClonedCodeInfo *CodeInfo) {
+ DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
@@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
Instruction *NewInst = II->clone();
+ if (F && F->getSubprogram())
+ DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
+ F->getSubprogram(), Cache);
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
NewBB->getInstList().push_back(NewInst);
@@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs)
- NewFunc->addMetadata(
- MD.first,
- *MapMetadata(MD.second, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
+ for (auto MD : MDs) {
+ MDNode *NewMD;
+ bool MustCloneSP =
+ (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
+ OldFunc->getParent() == NewFunc->getParent());
+ if (MustCloneSP) {
+ auto *SP = cast<DISubprogram>(MD.second);
+ NewMD = DISubprogram::getDistinct(
+ NewFunc->getContext(), SP->getScope(), SP->getName(),
+ NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(),
+ SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
+ SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
+ SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
+ SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
+ SP->getVariables(), SP->getThrownTypes());
+ } else
+ NewMD =
+ MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ NewFunc->addMetadata(MD.first, *NewMD);
+ }
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 4e9d67252d6c..5444b752de82 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
else
GV = new GlobalVariable(
*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+ nullptr, I->getName(), nullptr,
I->getThreadLocalMode(), I->getType()->getAddressSpace());
VMap[&*I] = GV;
// We do not copy attributes (mainly because copying between different
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c2386554da5..78d7474e5b95 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
// Create a cleanup block.
LLVMContext &C = F.getContext();
BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy =
- StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
if (!F.hasPersonalityFn()) {
Constant *PersFn = getDefaultPersonalityFn(F.getParent());
F.setPersonalityFn(PersFn);
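
This is one of several hunks in the patch (see also ModuleUtils.cpp and
SimplifyLibCalls.cpp below) dropping the trailing nullptr sentinel from
StructType::get in favor of the purely variadic element-type form. Side by
side:

    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    Type *makeExnTy(LLVMContext &C) {
      // Old, removed form: a nullptr sentinel terminated the element list.
      //   StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
      // New form used throughout this patch: just the element types.
      return StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
    }
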
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6d56e08af99f..9cb4762b683c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
return false;
}
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
- LLVMContext &Ctx,
- DenseMap<const DILocation *, DILocation *> &IANodes) {
- SmallVector<DILocation *, 3> InlinedAtLocations;
- DILocation *Last = InlinedAtNode;
- DILocation *CurInlinedAt = DL;
-
- // Gather all the inlined-at nodes
- while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
- // Skip any we've already built nodes for
- if (DILocation *Found = IANodes[IA]) {
- Last = Found;
- break;
- }
-
- InlinedAtLocations.push_back(IA);
- CurInlinedAt = IA;
- }
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DILocation *MD : reverse(InlinedAtLocations)) {
- Last = IANodes[MD] = DILocation::getDistinct(
- Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
- }
-
- // And finally create the normal location for this instruction, referring to
- // the new inlined-at chain.
- return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
/// block. Allocas used in inalloca calls and allocas of dynamic array size
/// cannot be static.
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
// Cache the inlined-at nodes as they're built so they are reused, without
// this every instruction's inlined-at chain would become distinct from each
// other.
- DenseMap<const DILocation *, DILocation *> IANodes;
+ DenseMap<const MDNode *, MDNode *> IANodes;
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (DebugLoc DL = BI->getDebugLoc()) {
- BI->setDebugLoc(
- updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
continue;
}
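
Both the removed updateInlinedAtInfo and its replacement,
DebugLoc::appendInlinedAt, rebuild an instruction's inlined-at chain so the
new call site becomes the outermost link: a chain DL -> ia1 -> ia2 inlined at
call site C becomes DL' -> ia1' -> ia2' -> C. A toy model of that bottom-up
rebuild (simplified types; the real code additionally caches rebuilt nodes in
IANodes so chains are shared between instructions):

    #include <deque>

    // Minimal stand-in for DILocation: a line plus an optional inlined-at
    // link; nullptr marks the outermost frame.
    struct Loc {
      unsigned Line;
      const Loc *InlinedAt;
    };

    // Rebuild Chain so NewSite becomes its outermost inlined-at link. The
    // deque keeps node addresses stable while rebuilt links are appended.
    const Loc *appendInlinedAt(std::deque<Loc> &Storage, const Loc *Chain,
                               const Loc *NewSite) {
      if (!Chain)
        return NewSite;
      const Loc *Tail = appendInlinedAt(Storage, Chain->InlinedAt, NewSite);
      Storage.push_back({Chain->Line, Tail});
      return &Storage.back();
    }
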
@@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const Optional<uint64_t> &CalleeEntryCount,
- const Instruction *TheCall) {
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI) {
if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
return;
Optional<uint64_t> CallSiteCount =
- ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+ PSI ? PSI->getProfileCount(TheCall, nullptr) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
CalleeEntryCount.getValue());
@@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
/// The callsite's block count is subtracted from the callee's function entry
/// count.
static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
- Instruction *CallInst, Function *Callee) {
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
// If the callee has an original count of N, and the estimated count of
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
Optional<uint64_t> CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
+ if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount =
- ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
@@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI);
// Update the profile count of callee.
- updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d1db05..53b432fcafd4 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -26,16 +26,15 @@ namespace {
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
bool runOnFunction(Function &F) override {
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
- AI != AE; ++AI)
- if (!AI->hasName() && !AI->getType()->isVoidTy())
- AI->setName("arg");
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
for (BasicBlock &BB : F) {
if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
return true;
}
};
-
+
char InstNamer::ID = 0;
}
-INITIALIZE_PASS(InstNamer, "instnamer",
+INITIALIZE_PASS(InstNamer, "instnamer",
"Assign names to anonymous instructions", false, false)
char &llvm::InstructionNamerID = InstNamer::ID;
//===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index ce6b703f3528..1ca509472b5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
KnownBits Known(BitWidth);
computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.Zero.countTrailingOnes();
+ unsigned TrailZ = Known.countMinTrailingZeros();
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
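
The trailing-zero count drives alignment because a value whose low k bits are
known zero is a multiple of 1 << k. A toy version of that derivation,
including the clamp the comment above asks for (the cap constant is
illustrative, not LLVM's):

    // k known trailing zero bits imply the value is a multiple of 1 << k,
    // so that is a safe lower bound on its alignment. A null pointer makes
    // every bit known zero, hence the clamp.
    unsigned alignmentFromTrailingZeros(unsigned TrailZ) {
      const unsigned MaxAlignLog2 = 29; // illustrative cap
      if (TrailZ > MaxAlignLog2)
        TrailZ = MaxAlignLog2;
      return 1u << TrailZ;
    }
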
@@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
- auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
@@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The fragment we're
- // describing will always be smaller than the variable size, because
- // VariableSize == Size of Alloca described by DDI. Since SI stores
- // to the alloca described by DDI, if it's first operand is an extend,
- // we're guaranteed that before extension, the value was narrower than
- // the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> Ops;
- unsigned FragmentOffset = 0;
- // If this already is a bit fragment, we drop the bit fragment from the
- // expression and record the offset.
- auto Fragment = DIExpr->getFragmentInfo();
- if (Fragment) {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- FragmentOffset = Fragment->OffsetInBits;
- } else {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
}
- Ops.push_back(dwarf::DW_OP_LLVM_fragment);
- Ops.push_back(FragmentOffset);
- const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
- auto NewDIExpr = Builder.createExpression(Ops);
- if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
- DDI->getDebugLoc(), SI);
- } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
- DDI->getDebugLoc(), SI);
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
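
The elements_end() - 3 slicing above works because a fragment expression
always ends in the triple (DW_OP_LLVM_fragment, offset-in-bits,
size-in-bits); the rewrite drops the old triple and appends one sized to the
pre-extension argument type. Just that list surgery, sketched with a
placeholder opcode constant:

    #include <cstdint>
    #include <vector>

    // Assumes OldOps ends in a (fragment-opcode, offset, size) triple, as in
    // the branch above. The opcode value here is a stand-in, not the real
    // DWARF constant.
    std::vector<uint64_t> resizeFragment(const std::vector<uint64_t> &OldOps,
                                         uint64_t OffsetInBits,
                                         uint64_t NewSizeInBits) {
      const uint64_t FragmentOpStub = 0x1000;
      std::vector<uint64_t> Ops(OldOps.begin(), OldOps.end() - 3);
      Ops.push_back(FragmentOpStub);
      Ops.push_back(OffsetInBits);
      Ops.push_back(NewSizeInBits);
      return Ops;
    }
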
@@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
combineMetadata(K, J, KnownIDs);
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
assert(From->getType() == To->getType());
-
+
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ) {
+ UI != UE;) {
Use &U = *UI++;
- if (DT.dominates(Root, U)) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '"
- << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
}
return Count;
}
unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
- const BasicBlock *BB) {
- assert(From->getType() == To->getType());
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (DT.properlyDominates(BB, I->getParent())) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
- }
- return Count;
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
}
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011d..81f033e7d51a 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
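
Concretely, for VF = 4 and an integer add, getShuffleReduction runs two
rounds: round one shuffles with mask [2, 3, undef, undef] and adds, halving
the live lanes; round two uses mask [1, undef, undef, undef] and adds,
leaving the full sum in lane 0 for the final extractelement. A scalar trace
of those rounds:

    // Scalar trace of getShuffleReduction for VF = 4, Op = Add.
    int reduceAdd4(const int (&V)[4]) {
      int R0 = V[0] + V[2]; // round 1: lane 0 += lane 2
      int R1 = V[1] + V[3]; // round 1: lane 1 += lane 3
      return R0 + R1;       // round 2: lane 0 += lane 1, then extract lane 0
    }
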
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
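
propagateIRFlags seeds the vector op's flags from the first scalar
(copyIRFlags) and then intersects with each remaining one (andIRFlags), so a
flag such as nsw survives only if every scalar carried it. The same
intersection as a toy over plain flag bitmasks:

    #include <cstdint>
    #include <vector>

    // Assumes at least one scalar, as the pass guarantees via VL[0]. Start
    // from the first mask, then AND in the rest, mirroring
    // copyIRFlags + andIRFlags above.
    uint32_t intersectIRFlags(const std::vector<uint32_t> &ScalarFlags) {
      uint32_t Flags = ScalarFlags.front();
      for (std::size_t i = 1; i < ScalarFlags.size(); ++i)
        Flags &= ScalarFlags[i];
      return Flags;
    }
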
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
// Upgrade a 2-field global array type to the new 3-field format if needed.
if (Data && OldEltTy->getNumElements() < 3)
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
else
EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
for (unsigned i = 0; i != n; ++i) {
auto Ctor = cast<Constant>(Init->getOperand(i));
if (EltTy != OldEltTy)
- Ctor = ConstantStruct::get(
- EltTy, Ctor->getAggregateElement((unsigned)0),
- Ctor->getAggregateElement(1),
- Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
CurrentCtors.push_back(Ctor);
}
}
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
} else {
// Use the new three-field struct if there isn't one already.
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
}
// Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 9e71d746de34..1de579ed41b0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ ResTy = StructType::get(ArgTy, ArgTy);
}
Module *M = OrigCallee->getParent();
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 83bd29dbca65..60d9ede2c487 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
const DataLayout &DL) {
LLVMContext &Ctx = SrcVal->getType()->getContext();
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 84d89f103a2f..930972924c3c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
Constant *NewV;
if (IsOldCtorDtor) {
auto *S = cast<ConstantStruct>(V);
- auto *E1 = mapValue(S->getOperand(0));
- auto *E2 = mapValue(S->getOperand(1));
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
} else {
NewV = cast_or_null<Constant>(mapValue(V));
}
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d72..9cf66382b581 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a453962..516ab7d03a88 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
EdgeMaskCacheTy;
typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
-
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// \brief The loop exit block may have single-value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop; each such PHI must still end up with
+ /// one incoming value for each predecessor of its parent block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single instruction within the innermost
- /// loop.
- void vectorizeInstruction(Instruction &I);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
/// passed Legality checks.
class LoopVectorizationPlanner {
public:
- LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
~LoopVectorizationPlanner() {}
@@ -2193,7 +2186,25 @@ public:
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
/// The profitablity analysis.
LoopVectorizationCostModel &CM;
};
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+ isa<PHINode>(PreviousParts[UF - 1]))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
if (!LCSSAPhi)
break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
-
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
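To make the induction-update case concrete, consider a hypothetical scalar loop (not taken from the patch) in which both candidates collected above become dead:

// Here i is the induction phi, i + 1 its update, and i < n the latch compare.
// The vectorized skeleton emits fresh step and latch computations, so the
// original update's only surviving user is the phi itself and the original
// compare's only user is the branch; both land in DeadInstructions.
void copyLoop(int *Dst, const int *Src, int n) {
  for (int i = 0; i < n; ++i)
    Dst[i] = Src[i];
}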
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(CM);
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c..64013d6d687d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -315,10 +300,10 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,7 +316,10 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
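The getNumOccurrences() test above is the usual cl::opt idiom for honoring a target default unless the user explicitly passed the flag: an explicit request still wins even when it happens to equal the init() value, which a plain value comparison could not distinguish. A minimal sketch of the pattern with a hypothetical flag:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<unsigned> WidthOption(
    "example-width", llvm::cl::init(128),
    llvm::cl::desc("Hypothetical width override"));

// getNumOccurrences() reports how often the flag appeared on the command
// line, separating "user passed -example-width=128" from "flag still holds
// its default of 128".
static unsigned pickWidth(unsigned TargetDefault) {
  return WidthOption.getNumOccurrences() ? WidthOption : TargetDefault;
}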
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -377,6 +365,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -415,6 +405,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -944,6 +936,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
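This hunk is part of the KnownBits migration: the old ComputeSignBit out-parameter pair (KnownZero, KnownOne) becomes a single struct query. A minimal sketch of the replacement idiom, assuming only the ValueTracking entry point used above:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/KnownBits.h"

// A value is known non-negative exactly when its sign bit is known zero;
// KnownBits::isNonNegative() expresses that without juggling two APInts.
static bool signBitKnownZero(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  return Known.isNonNegative();
}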
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}
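For the pairwise path that stays open-coded, the two masks built each round select the even and the odd lanes respectively, whereas the removed else-branch (now delegated to createSimpleTargetReduction) shuffled the upper half down. A standalone sketch of the two mask shapes, using a hypothetical helper that mirrors what createRdxShuffleMask appears to build:

#include <vector>

// For Width = 8, Stride = 4: pairwise-left = <0,2,4,6,u,u,u,u>, pairwise-right
// = <1,3,5,7,u,u,u,u>, and the old splitting mask = <4,5,6,7,u,u,u,u>, with -1
// standing in for an undef lane.
static std::vector<int> rdxMask(unsigned Width, unsigned Stride, bool Pairwise,
                                bool Left) {
  std::vector<int> Mask(Width, -1);
  for (unsigned j = 0; j != Stride; ++j)
    Mask[j] = Pairwise ? int(2 * j + (Left ? 0 : 1)) : int(Stride + j);
  return Mask;
}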
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index d2984697c8a9..6677063f944f 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -115,6 +115,7 @@ struct FDRState {
uint16_t CPUId;
uint16_t ThreadId;
uint64_t BaseTSC;
+
/// Encode some of the state transitions for the FDR log reader as explicit
/// checks. These are expectations for the next Record in the stream.
enum class Token {
@@ -123,8 +124,10 @@ struct FDRState {
NEW_CPU_ID_RECORD,
FUNCTION_SEQUENCE,
SCAN_TO_END_OF_THREAD_BUF,
+ CUSTOM_EVENT_DATA,
};
Token Expects;
+
// Each thread's buffer may have trailing garbage to scan over, so we track our
// progress.
uint64_t CurrentBufferSize;
@@ -143,6 +146,8 @@ Twine fdrStateToTwine(const FDRState::Token &state) {
return "FUNCTION_SEQUENCE";
case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
return "SCAN_TO_END_OF_THREAD_BUF";
+ case FDRState::Token::CUSTOM_EVENT_DATA:
+ return "CUSTOM_EVENT_DATA";
}
return "UNKNOWN";
}
@@ -212,13 +217,32 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
return Error::success();
}
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
+ // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+ // it regardless of the expectation. However, we do set the expectation to
+ // read a fixed number of bytes, as described in the metadata.
+ uint32_t OffsetPtr = 1; // Read after the first byte.
+ uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+ uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+ // FIXME: Actually represent the record through the API. For now we only skip
+ // over the data.
+ (void)TSC;
+ RecordSize = 16 + DataSize;
+ return Error::success();
+}
+
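A worked example of the size computation above: a marker with DataSize = 24 makes the loader advance RecordSize = 16 + 24 = 40 bytes. The sketch below lays out the fields under the assumptions visible in the reader (one kind byte, a u32 size at offset 1, a u64 TSC at offset 5, and an inferred three bytes of padding completing the fixed 16-byte metadata record); it is illustrative, not a wire-format declaration:

#include <cstdint>

// CustomEventMarker as consumed above: the kind byte was already read by the
// caller, DataSize and TSC follow, and DataSize payload bytes trail the
// 16-byte record (skipped for now, per the FIXME).
struct CustomEventLayout {
  uint8_t Kind;      // low bit = metadata record; remaining bits = kind 5
  uint32_t DataSize; // number of custom-event payload bytes after the record
  uint64_t TSC;      // timestamp counter captured at emission
};                   // + 3 assumed padding bytes + DataSize payload bytes

static uint64_t bytesToAdvance(uint32_t DataSize) { return 16u + DataSize; }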
/// Advances the state machine for reading the FDR record type by reading one
/// Metadata Record and updating the State appropriately based on the kind of
/// record encountered. The RecordKind is encoded in the first byte of the
/// Record, which the caller should pass in because they have already read it
/// to determine that this is a metadata record as opposed to a function record.
Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
- DataExtractor &RecordExtractor) {
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
// The remaining 7 bits are the RecordKind enum.
uint8_t RecordKind = RecordFirstByte >> 1;
switch (RecordKind) {
@@ -247,6 +271,11 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
return E;
break;
+ case 5: // CustomEventMarker
+ if (auto E = processCustomEventMarker(State, RecordFirstByte,
+ RecordExtractor, RecordSize))
+ return E;
+ break;
default:
// Widen the record type to uint16_t to prevent conversion to char.
return make_error<StringError>(
@@ -400,7 +429,8 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
bool isMetadataRecord = BitField & 0x01uL;
if (isMetadataRecord) {
RecordSize = 16;
- if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+ if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+ RecordSize))
return E;
State.CurrentBufferConsumed += RecordSize;
} else { // Process Function Record