From 93c91e39b29142dec1d03a30df9f6e757f56c193 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 19 Jul 2017 07:02:10 +0000 Subject: Vendor import of llvm trunk r308421: https://llvm.org/svn/llvm-project/llvm/trunk@308421 --- RELEASE_TESTERS.TXT | 7 +- docs/AliasAnalysis.rst | 3 +- docs/CodingStandards.rst | 6 +- docs/CommandGuide/lit.rst | 7 + include/llvm/Analysis/DominanceFrontier.h | 59 +- include/llvm/Analysis/DominanceFrontierImpl.h | 36 +- include/llvm/Analysis/IteratedDominanceFrontier.h | 20 +- include/llvm/Analysis/LazyCallGraph.h | 28 +- include/llvm/Analysis/LoopInfo.h | 9 +- include/llvm/Analysis/LoopInfoImpl.h | 17 +- include/llvm/Analysis/PostDominators.h | 6 +- include/llvm/Analysis/ScalarEvolution.h | 48 +- include/llvm/Analysis/TargetTransformInfo.h | 11 + include/llvm/Analysis/TargetTransformInfoImpl.h | 6 + include/llvm/CodeGen/BasicTTIImpl.h | 12 + include/llvm/CodeGen/MachineDominanceFrontier.h | 31 +- include/llvm/CodeGen/MachineDominators.h | 15 +- include/llvm/CodeGen/MachinePostDominators.h | 2 +- include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 16 +- include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 2 +- include/llvm/DebugInfo/CodeView/Formatters.h | 10 +- include/llvm/DebugInfo/CodeView/GUID.h | 55 + include/llvm/DebugInfo/CodeView/SymbolRecord.h | 2 +- include/llvm/DebugInfo/CodeView/TypeRecord.h | 13 +- .../llvm/DebugInfo/CodeView/TypeServerHandler.h | 38 - include/llvm/DebugInfo/CodeView/TypeStreamMerger.h | 14 +- include/llvm/DebugInfo/DWARF/DWARFUnit.h | 28 + include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 44 +- include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h | 2 +- include/llvm/DebugInfo/PDB/GenericError.h | 1 + include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 2 +- include/llvm/DebugInfo/PDB/Native/Formatters.h | 7 - include/llvm/DebugInfo/PDB/Native/InfoStream.h | 5 +- .../llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h | 4 +- .../llvm/DebugInfo/PDB/Native/NativeExeSymbol.h | 2 +- .../llvm/DebugInfo/PDB/Native/NativeRawSymbol.h | 
2 +- .../DebugInfo/PDB/Native/PDBTypeServerHandler.h | 46 - include/llvm/DebugInfo/PDB/Native/RawTypes.h | 14 +- include/llvm/DebugInfo/PDB/Native/TpiHashing.h | 73 +- include/llvm/DebugInfo/PDB/PDBExtras.h | 1 - .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 24 +- include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 5 +- include/llvm/IR/CallingConv.h | 14 +- include/llvm/IR/Constants.h | 8 + include/llvm/IR/DIBuilder.h | 37 +- include/llvm/IR/DebugInfoMetadata.h | 28 +- include/llvm/IR/Dominators.h | 49 +- include/llvm/IR/IntrinsicsHexagon.td | 26 +- include/llvm/IR/IntrinsicsSystemZ.td | 43 + include/llvm/MC/LaneBitmask.h | 3 + include/llvm/MC/MCFixup.h | 2 +- include/llvm/MC/MCInstrDesc.h | 9 + include/llvm/Object/COFFImportFile.h | 2 +- include/llvm/Object/COFFModuleDefinition.h | 8 +- include/llvm/ObjectYAML/CodeViewYAMLTypes.h | 2 + include/llvm/Support/AArch64TargetParser.def | 5 +- include/llvm/Support/BinaryItemStream.h | 39 +- include/llvm/Support/Format.h | 25 +- include/llvm/Support/GenericDomTree.h | 160 +- include/llvm/Support/GenericDomTreeConstruction.h | 670 +++- include/llvm/Support/TargetParser.h | 4 +- include/llvm/Support/YAMLTraits.h | 4 + .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 1 + include/llvm/Target/TargetLowering.h | 29 + .../llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h | 24 + lib/Analysis/CGSCCPassManager.cpp | 9 +- lib/Analysis/DominanceFrontier.cpp | 3 +- lib/Analysis/InstCount.cpp | 8 - lib/Analysis/InstructionSimplify.cpp | 21 +- lib/Analysis/IteratedDominanceFrontier.cpp | 8 +- lib/Analysis/LazyCallGraph.cpp | 49 +- lib/Analysis/LoopInfo.cpp | 2 +- lib/Analysis/MemorySSA.cpp | 1 - lib/Analysis/PostDominators.cpp | 2 + lib/Analysis/ScalarEvolution.cpp | 385 ++- lib/Analysis/TargetTransformInfo.cpp | 5 + lib/AsmParser/LLLexer.cpp | 2 +- lib/AsmParser/LLParser.cpp | 10 +- lib/AsmParser/LLToken.h | 2 +- lib/Bitcode/Reader/MetadataLoader.cpp | 8 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 1 + 
lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 5 +- lib/CodeGen/CodeGenPrepare.cpp | 81 +- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 3 + lib/CodeGen/MachineCombiner.cpp | 4 +- lib/CodeGen/MachineDominanceFrontier.cpp | 3 +- lib/CodeGen/MachineDominators.cpp | 6 +- lib/CodeGen/MachinePostDominators.cpp | 7 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 161 +- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 21 +- lib/CodeGen/XRayInstrumentation.cpp | 6 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 77 +- lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 9 +- lib/DebugInfo/CodeView/Formatters.cpp | 7 + lib/DebugInfo/CodeView/SymbolDumper.cpp | 2 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 2 +- lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 173 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 195 +- lib/DebugInfo/PDB/CMakeLists.txt | 1 - lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 12 +- lib/DebugInfo/PDB/GenericError.cpp | 2 + lib/DebugInfo/PDB/Native/InfoStream.cpp | 2 +- lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp | 2 +- lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 4 +- lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 4 +- lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp | 126 - lib/DebugInfo/PDB/Native/TpiHashing.cpp | 128 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 1 - lib/DebugInfo/PDB/PDBExtras.cpp | 6 - .../RuntimeDyld/Targets/RuntimeDyldMachOARM.h | 1 - lib/Fuzzer/CMakeLists.txt | 1 - lib/Fuzzer/FuzzerCorpus.h | 40 +- lib/Fuzzer/FuzzerDriver.cpp | 20 +- lib/Fuzzer/FuzzerFlags.def | 3 + lib/Fuzzer/FuzzerInternal.h | 14 +- lib/Fuzzer/FuzzerLoop.cpp | 30 +- lib/Fuzzer/FuzzerMerge.cpp | 8 +- lib/Fuzzer/FuzzerMutate.cpp | 23 +- lib/Fuzzer/FuzzerMutate.h | 6 - lib/Fuzzer/FuzzerTracePC.cpp | 83 + lib/Fuzzer/FuzzerTracePC.h | 23 + lib/Fuzzer/FuzzerTraceState.cpp | 181 -- lib/Fuzzer/FuzzerUtil.cpp | 7 + lib/Fuzzer/FuzzerUtil.h | 10 + lib/Fuzzer/afl/afl_driver.cpp | 4 +- lib/Fuzzer/test/CMakeLists.txt | 3 +- lib/Fuzzer/test/FlagsTest.cpp | 32 + 
lib/Fuzzer/test/FuzzerUnittest.cpp | 31 +- lib/Fuzzer/test/fuzzer-flags.test | 17 +- lib/Fuzzer/test/fuzzer-traces-hooks.test | 2 +- lib/Fuzzer/test/reduce_inputs.test | 3 +- lib/IR/AsmWriter.cpp | 3 +- lib/IR/Constants.cpp | 86 +- lib/IR/Core.cpp | 1 - lib/IR/DIBuilder.cpp | 26 +- lib/IR/DebugInfoMetadata.cpp | 9 +- lib/IR/Dominators.cpp | 42 +- lib/IR/LLVMContextImpl.h | 16 +- lib/IR/LegacyPassManager.cpp | 51 +- lib/IR/Module.cpp | 2 +- lib/Object/ArchiveWriter.cpp | 3 +- lib/Object/COFFImportFile.cpp | 144 +- lib/Object/COFFModuleDefinition.cpp | 36 +- lib/Object/COFFObjectFile.cpp | 5 +- lib/ObjectYAML/CodeViewYAMLTypes.cpp | 27 + lib/Option/OptTable.cpp | 25 +- lib/Support/ErrorHandling.cpp | 22 + lib/Support/Host.cpp | 41 +- lib/Support/Path.cpp | 2 - lib/Support/TargetParser.cpp | 2 + lib/Support/YAMLTraits.cpp | 8 + lib/Support/raw_ostream.cpp | 29 +- lib/Target/AArch64/AArch64.h | 4 + lib/Target/AArch64/AArch64.td | 4 + lib/Target/AArch64/AArch64CallingConvention.td | 7 + .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 4 + lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 790 +++++ lib/Target/AArch64/AArch64FastISel.cpp | 1 + lib/Target/AArch64/AArch64FrameLowering.cpp | 12 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 15 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 64 +- lib/Target/AArch64/AArch64ISelLowering.h | 16 + lib/Target/AArch64/AArch64InstrAtomics.td | 10 + lib/Target/AArch64/AArch64InstrInfo.cpp | 13 +- lib/Target/AArch64/AArch64InstrInfo.h | 12 +- lib/Target/AArch64/AArch64InstrInfo.td | 2 + lib/Target/AArch64/AArch64LegalizerInfo.cpp | 2 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 +- lib/Target/AArch64/AArch64Subtarget.cpp | 7 +- lib/Target/AArch64/AArch64Subtarget.h | 13 + lib/Target/AArch64/AArch64TargetMachine.cpp | 23 +- lib/Target/AArch64/AArch64TargetMachine.h | 2 + lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 21 +- lib/Target/AArch64/CMakeLists.txt | 1 + .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 6 +- 
.../MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 61 +- lib/Target/AMDGPU/AMDGPU.h | 2 +- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 252 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 9 - lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 + lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 5 + .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + lib/Target/AMDGPU/SIFoldOperands.cpp | 1 + lib/Target/AMDGPU/SIFrameLowering.cpp | 11 +- lib/Target/AMDGPU/SIISelLowering.cpp | 119 +- lib/Target/AMDGPU/SIISelLowering.h | 2 + lib/Target/AMDGPU/SIInstrInfo.cpp | 24 +- lib/Target/AMDGPU/SIInstrInfo.h | 19 +- lib/Target/AMDGPU/SIInstrInfo.td | 2 +- lib/Target/AMDGPU/SIInstructions.td | 2 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 68 +- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 + lib/Target/AMDGPU/SIRegisterInfo.td | 17 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 +- lib/Target/AMDGPU/VOP2Instructions.td | 11 +- lib/Target/AMDGPU/VOP3Instructions.td | 11 +- lib/Target/AMDGPU/VOP3PInstructions.td | 10 +- lib/Target/AMDGPU/VOPCInstructions.td | 24 +- lib/Target/AMDGPU/VOPInstructions.td | 2 + lib/Target/ARM/ARM.td | 431 +-- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 4 +- lib/Target/ARM/ARMFastISel.cpp | 43 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 20 +- lib/Target/ARM/ARMInstructionSelector.cpp | 23 + lib/Target/ARM/ARMLegalizerInfo.cpp | 8 +- lib/Target/ARM/ARMMCInstLower.cpp | 4 +- lib/Target/ARM/ARMRegisterBankInfo.cpp | 9 + lib/Target/ARM/ARMTargetMachine.h | 2 + lib/Target/BPF/BPFISelLowering.cpp | 45 +- lib/Target/BPF/BPFInstrInfo.td | 5 + lib/Target/Hexagon/HexagonBitSimplify.cpp | 4 +- lib/Target/Hexagon/HexagonDepInstrInfo.td | 4 + lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 4 +- lib/Target/Hexagon/HexagonExpandCondsets.cpp | 4 +- 
lib/Target/Hexagon/HexagonFrameLowering.cpp | 4 +- lib/Target/Hexagon/HexagonGenInsert.cpp | 4 +- lib/Target/Hexagon/HexagonGenPredicate.cpp | 4 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 147 +- lib/Target/Hexagon/HexagonISelLowering.h | 4 +- lib/Target/Hexagon/HexagonIntrinsics.td | 12 + lib/Target/Hexagon/HexagonOptAddrMode.cpp | 4 +- lib/Target/Hexagon/HexagonPatterns.td | 92 +- lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 47 + lib/Target/Hexagon/HexagonTargetObjectFile.h | 6 + lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 219 -- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 15 - lib/Target/Mips/Mips.td | 3 + lib/Target/Mips/MipsISelLowering.cpp | 146 +- lib/Target/Mips/MipsInstrFPU.td | 9 + lib/Target/Mips/MipsMTInstrFormats.td | 21 - lib/Target/Mips/MipsMTInstrInfo.td | 110 - lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 99 +- lib/Target/Mips/MipsSEISelDAGToDAG.h | 3 +- lib/Target/Mips/MipsSEISelLowering.cpp | 203 +- lib/Target/Mips/MipsSchedule.td | 4 - lib/Target/Mips/MipsSubtarget.h | 5 + lib/Target/Mips/MipsTargetStreamer.h | 3 - .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 3 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 28 +- lib/Target/PowerPC/PPCISelLowering.cpp | 18 +- lib/Target/PowerPC/PPCISelLowering.h | 2 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 10 +- lib/Target/PowerPC/PPCInstrInfo.td | 22 +- lib/Target/PowerPC/PPCInstrVSX.td | 90 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 30 +- lib/Target/PowerPC/PPCTargetMachine.h | 2 + lib/Target/Sparc/Sparc.td | 6 +- lib/Target/Sparc/SparcISelLowering.cpp | 13 + lib/Target/Sparc/SparcInstrInfo.td | 3 + lib/Target/Sparc/SparcSubtarget.cpp | 1 + lib/Target/Sparc/SparcSubtarget.h | 2 + lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 15 +- lib/Target/SystemZ/LLVMBuild.txt | 2 +- lib/Target/SystemZ/SystemZFeatures.td | 58 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 104 +- lib/Target/SystemZ/SystemZISelLowering.h | 7 + lib/Target/SystemZ/SystemZInstrFP.td | 88 +- 
lib/Target/SystemZ/SystemZInstrFormats.td | 330 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 32 + lib/Target/SystemZ/SystemZInstrInfo.td | 68 +- lib/Target/SystemZ/SystemZInstrSystem.td | 4 + lib/Target/SystemZ/SystemZInstrVector.td | 366 ++- lib/Target/SystemZ/SystemZOperators.td | 20 + lib/Target/SystemZ/SystemZPatterns.td | 7 + lib/Target/SystemZ/SystemZProcessors.td | 3 + lib/Target/SystemZ/SystemZRegisterInfo.td | 14 +- lib/Target/SystemZ/SystemZSchedule.td | 3 +- lib/Target/SystemZ/SystemZScheduleZ14.td | 1611 ++++++++++ lib/Target/SystemZ/SystemZScheduleZ196.td | 163 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 161 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 40 + lib/Target/SystemZ/SystemZSubtarget.cpp | 4 + lib/Target/SystemZ/SystemZSubtarget.h | 34 + lib/Target/SystemZ/SystemZTargetMachine.cpp | 4 +- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 3 + lib/Target/SystemZ/SystemZTargetTransformInfo.h | 4 + lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.h | 3 + lib/Target/X86/X86.td | 6 +- lib/Target/X86/X86CallingConv.td | 4 +- lib/Target/X86/X86CmovConversion.cpp | 611 ++++ lib/Target/X86/X86FastISel.cpp | 4 +- lib/Target/X86/X86FixupBWInsts.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 66 +- lib/Target/X86/X86InstrAVX512.td | 231 +- lib/Target/X86/X86RegisterInfo.cpp | 6 +- lib/Target/X86/X86Schedule.td | 1 + lib/Target/X86/X86ScheduleBtVer2.td | 16 + lib/Target/X86/X86ScheduleZnver1.td | 223 ++ lib/Target/X86/X86Subtarget.h | 2 +- lib/Target/X86/X86TargetMachine.cpp | 1 + lib/Target/X86/X86TargetMachine.h | 2 + lib/ToolDrivers/CMakeLists.txt | 1 + lib/ToolDrivers/LLVMBuild.txt | 2 +- lib/ToolDrivers/llvm-dlltool/CMakeLists.txt | 9 + lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp | 160 + lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt | 22 + lib/ToolDrivers/llvm-dlltool/Options.td | 26 + lib/Transforms/IPO/GlobalOpt.cpp | 18 + lib/Transforms/IPO/Inliner.cpp | 2 +- lib/Transforms/IPO/SampleProfile.cpp | 11 +- 
lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 56 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 36 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 5 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 6 +- .../InstCombine/InstructionCombining.cpp | 76 +- .../Instrumentation/AddressSanitizer.cpp | 38 + lib/Transforms/Instrumentation/MemorySanitizer.cpp | 4 +- .../Instrumentation/SanitizerCoverage.cpp | 10 + lib/Transforms/Scalar/EarlyCSE.cpp | 16 +- lib/Transforms/Scalar/GVN.cpp | 1 + .../Scalar/InductiveRangeCheckElimination.cpp | 72 +- lib/Transforms/Scalar/JumpThreading.cpp | 82 +- lib/Transforms/Scalar/LoopInterchange.cpp | 119 +- lib/Transforms/Scalar/TailRecursionElimination.cpp | 15 +- lib/Transforms/Utils/LoopUnrollRuntime.cpp | 30 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 64 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 30 +- runtimes/CMakeLists.txt | 9 +- test/Analysis/CostModel/SystemZ/fp-arith.ll | 53 +- test/Assembler/diimportedentity.ll | 4 +- test/Bitcode/DIGlobalVariableExpression.ll | 2 +- test/Bitcode/compatibility-3.6.ll | 6 +- test/Bitcode/compatibility-3.7.ll | 6 +- test/Bitcode/compatibility-3.8.ll | 6 +- test/Bitcode/compatibility-3.9.ll | 6 +- test/Bitcode/compatibility-4.0.ll | 6 +- test/Bitcode/compatibility.ll | 6 +- test/Bitcode/upgrade-importedentity.ll | 15 + test/Bitcode/upgrade-importedentity.ll.bc | Bin 0 -> 1216 bytes test/CMakeLists.txt | 3 + test/CodeGen/AArch64/GlobalISel/select-fma.mir | 41 + test/CodeGen/AArch64/aarch64_win64cc_vararg.ll | 74 + test/CodeGen/AArch64/arm64-abi-varargs.ll | 3 +- test/CodeGen/AArch64/arm64-abi_align.ll | 32 +- .../AArch64/arm64-alloca-frame-pointer-offset.ll | 6 +- test/CodeGen/AArch64/arm64-extern-weak.ll | 2 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 10 + test/CodeGen/AArch64/arm64-platform-reg.ll | 1 + test/CodeGen/AArch64/arm64-vext.ll | 8 +- test/CodeGen/AArch64/atomic-ops-lse.ll | 161 + test/CodeGen/AArch64/dag-combine-invaraints.ll | 2 +- 
test/CodeGen/AArch64/extern-weak.ll | 2 +- test/CodeGen/AArch64/falkor-hwpf-fix.ll | 67 + test/CodeGen/AArch64/falkor-hwpf-fix.mir | 52 + test/CodeGen/AArch64/falkor-hwpf.ll | 106 + .../AArch64/preferred-function-alignment.ll | 2 +- test/CodeGen/AArch64/swifterror.ll | 12 +- test/CodeGen/AArch64/win64_vararg.ll | 95 + .../AMDGPU/annotate-kernel-features-hsa-call.ll | 312 ++ .../CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 11 + .../AMDGPU/attr-amdgpu-flat-work-group-size.ll | 2 +- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll | 14 +- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 62 +- test/CodeGen/AMDGPU/function-args.ll | 16 + test/CodeGen/AMDGPU/hsa.ll | 10 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 12 + test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 4 +- test/CodeGen/AMDGPU/move-to-valu-worklist.ll | 29 + test/CodeGen/AMDGPU/mubuf-offset-private.ll | 26 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/parallelorifcollapse.ll | 2 +- test/CodeGen/AMDGPU/private-access-no-objects.ll | 10 +- .../rename-independent-subregs-mac-operands.mir | 2 +- test/CodeGen/AMDGPU/scratch-simple.ll | 72 +- test/CodeGen/AMDGPU/sdwa-peephole-instr.mir | 35 +- test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir | 4 +- test/CodeGen/AMDGPU/trap.ll | 8 +- .../CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 2 +- .../vgpr-spill-emergency-stack-slot-compute.ll | 36 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 6 +- .../ARM/GlobalISel/arm-instruction-select.mir | 39 + test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll | 52 + test/CodeGen/ARM/GlobalISel/arm-isel.ll | 39 + .../CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir | 174 + test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 36 + test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 30 + test/CodeGen/ARM/atomic-op.ll | 15 + test/CodeGen/AVR/branch-relaxation.ll | 4 +- test/CodeGen/BPF/select_ri.ll | 27 + test/CodeGen/BPF/setcc.ll | 4 +- 
test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll | 1 - .../CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll | 3 - test/CodeGen/Generic/print-mul-exp.ll | 1 + test/CodeGen/Generic/print-mul.ll | 1 + test/CodeGen/Generic/print-shift.ll | 1 + test/CodeGen/Generic/v-split.ll | 3 - test/CodeGen/Generic/vector-redux.ll | 3 - test/CodeGen/Generic/vector.ll | 3 - test/CodeGen/Hexagon/intrinsics/system_user.ll | 76 +- .../CodeGen/Hexagon/switch-lut-explicit-section.ll | 32 + .../CodeGen/Hexagon/switch-lut-function-section.ll | 30 + .../Hexagon/switch-lut-multiple-functions.ll | 42 + test/CodeGen/Hexagon/switch-lut-text-section.ll | 27 + test/CodeGen/Hexagon/v6vec-vprint.ll | 2 +- test/CodeGen/Hexagon/vect/vect-load-v4i16.ll | 23 + test/CodeGen/Hexagon/vect/vect-loadv4i16.ll | 73 - test/CodeGen/Hexagon/vect/vect-v4i16.ll | 73 + test/CodeGen/MIR/AArch64/target-memoperands.mir | 4 + test/CodeGen/MIR/AMDGPU/fold-multiple.mir | 40 + test/CodeGen/MSP430/vararg.ll | 4 +- test/CodeGen/Mips/2008-06-05-Carry.ll | 13 +- test/CodeGen/Mips/dins.ll | 4 +- test/CodeGen/Mips/dsp-patterns.ll | 4 +- test/CodeGen/Mips/llcarry.ll | 11 +- test/CodeGen/Mips/llvm-ir/add.ll | 394 ++- test/CodeGen/Mips/llvm-ir/sub.ll | 174 +- test/CodeGen/Mips/long-calls.ll | 57 + test/CodeGen/Mips/madd-msub.ll | 81 +- test/CodeGen/Mips/msa/f16-llvm-ir.ll | 12 +- test/CodeGen/PowerPC/PR33671.ll | 32 + test/CodeGen/PowerPC/build-vector-tests.ll | 40 +- test/CodeGen/PowerPC/ppc64-i128-abi.ll | 6 +- test/CodeGen/PowerPC/swaps-le-6.ll | 8 +- test/CodeGen/PowerPC/vsx-p9.ll | 48 +- test/CodeGen/SPARC/soft-mul-div.ll | 65 + test/CodeGen/SystemZ/branch-11.ll | 56 + test/CodeGen/SystemZ/fp-abs-03.ll | 43 + test/CodeGen/SystemZ/fp-abs-04.ll | 46 + test/CodeGen/SystemZ/fp-add-01.ll | 6 +- test/CodeGen/SystemZ/fp-add-04.ll | 17 + test/CodeGen/SystemZ/fp-cmp-01.ll | 102 +- test/CodeGen/SystemZ/fp-cmp-06.ll | 33 + test/CodeGen/SystemZ/fp-const-11.ll | 40 + test/CodeGen/SystemZ/fp-conv-15.ll | 50 + 
test/CodeGen/SystemZ/fp-conv-16.ll | 99 + test/CodeGen/SystemZ/fp-copysign-02.ll | 81 + test/CodeGen/SystemZ/fp-div-01.ll | 6 +- test/CodeGen/SystemZ/fp-div-04.ll | 17 + test/CodeGen/SystemZ/fp-move-13.ll | 46 + test/CodeGen/SystemZ/fp-mul-01.ll | 6 +- test/CodeGen/SystemZ/fp-mul-06.ll | 31 +- test/CodeGen/SystemZ/fp-mul-08.ll | 31 +- test/CodeGen/SystemZ/fp-mul-10.ll | 43 + test/CodeGen/SystemZ/fp-mul-11.ll | 32 + test/CodeGen/SystemZ/fp-mul-12.ll | 72 + test/CodeGen/SystemZ/fp-neg-02.ll | 41 + test/CodeGen/SystemZ/fp-round-03.ll | 207 ++ test/CodeGen/SystemZ/fp-sqrt-01.ll | 8 +- test/CodeGen/SystemZ/fp-sqrt-04.ll | 17 + test/CodeGen/SystemZ/fp-sub-01.ll | 6 +- test/CodeGen/SystemZ/fp-sub-04.ll | 17 + test/CodeGen/SystemZ/int-add-17.ll | 95 + test/CodeGen/SystemZ/int-mul-09.ll | 95 + test/CodeGen/SystemZ/int-mul-10.ll | 165 + test/CodeGen/SystemZ/int-mul-11.ll | 32 + test/CodeGen/SystemZ/int-sub-10.ll | 95 + test/CodeGen/SystemZ/tdc-07.ll | 18 + test/CodeGen/SystemZ/vec-abs-06.ll | 47 + test/CodeGen/SystemZ/vec-add-02.ll | 24 + test/CodeGen/SystemZ/vec-and-04.ll | 47 + test/CodeGen/SystemZ/vec-cmp-07.ll | 349 ++ test/CodeGen/SystemZ/vec-ctpop-02.ll | 45 + test/CodeGen/SystemZ/vec-div-02.ll | 24 + test/CodeGen/SystemZ/vec-intrinsics-01.ll | 3335 ++++++++++++++++++++ test/CodeGen/SystemZ/vec-intrinsics-02.ll | 441 +++ test/CodeGen/SystemZ/vec-intrinsics.ll | 3335 -------------------- test/CodeGen/SystemZ/vec-max-05.ll | 175 + test/CodeGen/SystemZ/vec-min-05.ll | 175 + test/CodeGen/SystemZ/vec-move-18.ll | 24 + test/CodeGen/SystemZ/vec-mul-03.ll | 24 + test/CodeGen/SystemZ/vec-mul-04.ll | 31 + test/CodeGen/SystemZ/vec-mul-05.ll | 63 + test/CodeGen/SystemZ/vec-neg-02.ll | 23 + test/CodeGen/SystemZ/vec-or-03.ll | 91 + test/CodeGen/SystemZ/vec-round-02.ll | 118 + test/CodeGen/SystemZ/vec-sqrt-02.ll | 23 + test/CodeGen/SystemZ/vec-sub-02.ll | 31 + test/CodeGen/SystemZ/vec-xor-02.ll | 47 + test/CodeGen/Thumb/litpoolremat.ll | 28 + test/CodeGen/Thumb/select.ll | 4 +- 
test/CodeGen/WebAssembly/indirect-import.ll | 9 +- test/CodeGen/WebAssembly/userstack.ll | 10 +- test/CodeGen/X86/2008-01-08-SchedulerCrash.ll | 2 +- test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll | 2 +- test/CodeGen/X86/2011-10-19-widen_vselect.ll | 7 +- test/CodeGen/X86/DynamicCalleeSavedRegisters.ll | 2 +- test/CodeGen/X86/alias-static-alloca.ll | 37 + test/CodeGen/X86/atomic-minmax-i6432.ll | 16 +- test/CodeGen/X86/atomic128.ll | 64 +- test/CodeGen/X86/avx-schedule.ll | 508 +-- test/CodeGen/X86/avx2-arith.ll | 8 +- test/CodeGen/X86/avx2-schedule.ll | 116 +- test/CodeGen/X86/avx2-vector-shifts.ll | 4 +- test/CodeGen/X86/avx512-cvt.ll | 2 +- test/CodeGen/X86/avx512-mask-op.ll | 5 +- test/CodeGen/X86/avx512-rotate.ll | 256 ++ test/CodeGen/X86/avx512-shift.ll | 148 +- test/CodeGen/X86/bmi-schedule.ll | 529 ++++ test/CodeGen/X86/bmi2-schedule.ll | 180 ++ test/CodeGen/X86/bool-ext-inc.ll | 8 +- test/CodeGen/X86/bswap-rotate.ll | 27 + test/CodeGen/X86/clobber-fi0.ll | 14 +- test/CodeGen/X86/combine-rotates.ll | 27 +- test/CodeGen/X86/combine-shl.ll | 12 +- test/CodeGen/X86/combine-srl.ll | 8 +- test/CodeGen/X86/combine-udiv.ll | 2 +- test/CodeGen/X86/combine-urem.ll | 8 +- test/CodeGen/X86/f16c-schedule.ll | 144 + test/CodeGen/X86/fast-isel-x86-64.ll | 2 +- test/CodeGen/X86/hipe-cc.ll | 6 +- test/CodeGen/X86/hipe-cc64.ll | 6 +- test/CodeGen/X86/lea32-schedule.ll | 653 ++++ test/CodeGen/X86/lea64-schedule.ll | 534 ++++ test/CodeGen/X86/legalize-shift-64.ll | 8 +- test/CodeGen/X86/lzcnt-schedule.ll | 119 + test/CodeGen/X86/machine-outliner-debuginfo.ll | 1 + test/CodeGen/X86/machine-outliner.ll | 1 + test/CodeGen/X86/memcmp-minsize.ll | 721 +++++ test/CodeGen/X86/memcmp-optsize.ll | 871 +++++ test/CodeGen/X86/memcmp.ll | 827 +++-- test/CodeGen/X86/pmul.ll | 6 +- test/CodeGen/X86/popcnt-schedule.ll | 167 + test/CodeGen/X86/pr32282.ll | 104 + test/CodeGen/X86/pr32515.ll | 29 + test/CodeGen/X86/pr33772.ll | 15 + test/CodeGen/X86/pr33828.ll | 48 + 
test/CodeGen/X86/regparm.ll | 2 +- test/CodeGen/X86/rotate_vec.ll | 54 + test/CodeGen/X86/sibcall-win64.ll | 22 +- test/CodeGen/X86/sse-schedule.ll | 327 +- test/CodeGen/X86/sse2-schedule.ll | 824 ++++- test/CodeGen/X86/sse3-schedule.ll | 64 +- test/CodeGen/X86/sse41-schedule.ll | 311 +- test/CodeGen/X86/sse42-schedule.ll | 81 +- test/CodeGen/X86/sse4a-schedule.ll | 40 +- test/CodeGen/X86/ssse3-schedule.ll | 98 +- test/CodeGen/X86/statepoint-invoke.ll | 2 +- test/CodeGen/X86/statepoint-stack-usage.ll | 42 +- test/CodeGen/X86/statepoint-vector.ll | 4 +- test/CodeGen/X86/vec_cmp_uint-128.ll | 8 +- test/CodeGen/X86/vector-idiv-sdiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-sdiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv.ll | 2 +- test/CodeGen/X86/vector-rotate-128.ll | 203 +- test/CodeGen/X86/vector-rotate-256.ll | 256 +- test/CodeGen/X86/vector-rotate-512.ll | 831 +++++ test/CodeGen/X86/vector-shift-ashr-256.ll | 10 +- test/CodeGen/X86/vector-tzcnt-128.ll | 4 +- test/CodeGen/X86/vector-tzcnt-256.ll | 8 +- test/CodeGen/X86/vector-tzcnt-512.ll | 8 +- test/CodeGen/X86/vselect-avx.ll | 8 +- test/CodeGen/X86/widen_arith-2.ll | 15 +- test/CodeGen/X86/widen_cast-4.ll | 34 +- test/CodeGen/X86/win64-nosse-csrs.ll | 2 +- test/CodeGen/X86/win64_nonvol.ll | 2 +- test/CodeGen/X86/win64_params.ll | 2 +- test/CodeGen/X86/win_chkstk.ll | 2 +- test/CodeGen/X86/win_coreclr_chkstk.ll | 4 +- test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 14 +- test/CodeGen/X86/x86-cmov-converter.ll | 321 ++ test/CodeGen/XCore/varargs.ll | 8 +- test/DebugInfo/Generic/namespace.ll | 41 +- test/DebugInfo/PDB/pdbdump-headers.test | 184 +- test/DebugInfo/X86/DIModule.ll | 2 +- test/DebugInfo/X86/DIModuleContext.ll | 2 +- test/DebugInfo/X86/fission-inline.ll | 2 +- test/DebugInfo/X86/gnu-public-names.ll | 4 +- test/DebugInfo/X86/lexical-block-file-inline.ll | 2 +- test/DebugInfo/X86/pr19307.ll | 12 +- 
test/DllTool/coff-exports.def | 13 + test/DllTool/coff-weak-exports.def | 19 + test/DllTool/lit.local.cfg | 1 + test/FileCheck/regex-scope.txt | 2 +- test/Instrumentation/AddressSanitizer/basic.ll | 20 + .../AddressSanitizer/stack-poisoning-byval-args.ll | 48 + .../unordered_atomic_mem_intrins.ll | 37 + .../EfficiencySanitizer/working_set_basic.ll | 33 + .../EfficiencySanitizer/working_set_slow.ll | 32 + test/Instrumentation/MemorySanitizer/msan_basic.ll | 35 + .../SanitizerCoverage/cmp-tracing-api-x86_32.ll | 22 + .../SanitizerCoverage/cmp-tracing-api-x86_64.ll | 22 + test/Linker/pr26037.ll | 4 +- test/MC/AArch64/coff-relocations.s | 52 + test/MC/AArch64/invalid-instructions-spellcheck.s | 37 + test/MC/AMDGPU/gfx9_asm_all.s | 351 ++ test/MC/AMDGPU/vop3-errs.s | 38 +- test/MC/ARM/virtexts-thumb.s | 2 +- test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt | 405 +++ test/MC/Disassembler/Mips/mt/valid-r2-el.txt | 21 +- test/MC/Disassembler/Mips/mt/valid-r2.txt | 21 +- test/MC/Disassembler/SystemZ/insns-z14.txt | 3253 +++++++++++++++++++ test/MC/Mips/mt/invalid-wrong-error.s | 3 - test/MC/Mips/mt/invalid.s | 38 +- .../mt/mftr-mttr-aliases-invalid-wrong-error.s | 18 - test/MC/Mips/mt/mftr-mttr-aliases-invalid.s | 23 - test/MC/Mips/mt/mftr-mttr-aliases.s | 47 - test/MC/Mips/mt/mftr-mttr-reserved-valid.s | 8 - test/MC/Mips/mt/valid.s | 42 +- test/MC/SystemZ/insn-bad-z13.s | 705 +++++ test/MC/SystemZ/insn-bad-z14.s | 752 +++++ test/MC/SystemZ/insn-good-z14.s | 2674 ++++++++++++++++ test/MC/SystemZ/invalid-instructions-spellcheck.s | 66 + test/MC/X86/pr22028.s | 4 +- test/Object/no-section-table.test | 2 +- test/Object/readobj-shared-object.test | 12 +- test/ObjectYAML/CodeView/guid.yaml | 59 + test/Other/cgscc-libcall-update.ll | 61 + test/Other/new-pass-manager.ll | 2 + test/ThinLTO/X86/debuginfo-cu-import.ll | 4 +- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 1635 +++++++++- .../Transforms/CodeGenPrepare/X86/sink-addrmode.ll | 35 +- 
test/Transforms/EarlyCSE/globalsaa-memoryssa.ll | 25 + .../GVN/PRE/2017-06-28-pre-load-dbgloc.ll | 79 + test/Transforms/GVN/PRE/phi-translate.ll | 4 +- test/Transforms/GlobalOpt/pr33686.ll | 17 + test/Transforms/IRCE/eq_ne.ll | 257 ++ test/Transforms/IRCE/pre_post_loops.ll | 117 + test/Transforms/Inline/AArch64/ext.ll | 249 ++ test/Transforms/Inline/PowerPC/ext.ll | 140 + test/Transforms/Inline/PowerPC/lit.local.cfg | 3 + test/Transforms/Inline/X86/ext.ll | 201 ++ .../Transforms/InstCombine/2017-07-07-UMul-ZExt.ll | 24 +- test/Transforms/InstCombine/and-not-or.ll | 34 - test/Transforms/InstCombine/and.ll | 192 ++ test/Transforms/InstCombine/and2.ll | 85 +- .../InstCombine/element-atomic-memintrins.ll | 98 + test/Transforms/InstCombine/icmp-logical.ll | 165 +- test/Transforms/InstCombine/or-xor.ll | 28 +- test/Transforms/InstCombine/or.ll | 291 +- test/Transforms/InstCombine/pr33765.ll | 32 + test/Transforms/JumpThreading/select.ll | 77 +- .../LoopInterchange/current-limitations-lcssa.ll | 76 + .../LoopInterchange/interchange-flow-dep-outer.ll | 118 + .../LoopInterchange/interchange-not-profitable.ll | 66 + .../interchange-output-dependencies.ll | 86 + .../interchange-simple-count-down.ll | 69 + .../LoopInterchange/interchange-simple-count-up.ll | 86 + test/Transforms/LoopInterchange/interchange.ll | 749 ----- .../loop-interchange-optimization-remarks.ll | 220 ++ .../not-interchanged-dependencies-1.ll | 64 + .../not-interchanged-loop-nest-3.ll | 87 + .../not-interchanged-tightly-nested.ll | 143 + .../runtime-loop-multiexit-dom-verify.ll | 126 + .../LoopVectorize/X86/float-induction-x86.ll | 6 +- test/Transforms/LoopVectorize/debugloc.ll | 2 +- .../LoopVectorize/first-order-recurrence.ll | 8 +- test/Transforms/LoopVectorize/float-induction.ll | 14 +- .../Transforms/LoopVectorize/if-conversion-nest.ll | 25 +- test/Transforms/LoopVectorize/induction-step.ll | 4 +- test/Transforms/LoopVectorize/induction.ll | 4 +- .../interleaved-accesses-pred-stores.ll | 6 +- 
.../LoopVectorize/interleaved-accesses.ll | 10 +- test/Transforms/LoopVectorize/iv_outside_user.ll | 2 +- test/Transforms/LoopVectorize/miniters.ll | 4 +- .../LoopVectorize/pr30654-phiscev-sext-trunc.ll | 240 ++ .../LoopVectorize/runtime-check-readonly.ll | 1 - test/Transforms/LoopVectorize/runtime-check.ll | 2 +- test/tools/llvm-cov/showTabsHTML.cpp | 4 +- test/tools/llvm-dwarfdump/X86/verify_debug_info.s | 193 ++ .../llvm-dwarfdump/X86/verify_unit_header_chain.s | 81 + test/tools/llvm-mt/help.test | 7 + .../AArch64/Inputs/reloc-addend.obj.macho-aarch64 | Bin 0 -> 424 bytes .../llvm-objdump/AArch64/macho-reloc-addend.test | 6 + .../tools/llvm-readobj/Inputs/dynamic-table-so.x86 | Bin 8280 -> 8256 bytes test/tools/llvm-readobj/Inputs/dynamic-table.c | 4 +- test/tools/llvm-readobj/dynamic.test | 39 +- test/tools/llvm-readobj/gnu-sections.test | 10 +- tools/llvm-ar/CMakeLists.txt | 2 + tools/llvm-ar/llvm-ar.cpp | 6 +- tools/llvm-mt/CMakeLists.txt | 13 + tools/llvm-mt/LLVMBuild.txt | 22 + tools/llvm-mt/Opts.td | 29 + tools/llvm-mt/llvm-mt.cpp | 117 + tools/llvm-objdump/llvm-objdump.cpp | 5 +- tools/llvm-pdbutil/DumpOutputStyle.cpp | 4 +- tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 5 +- tools/llvm-pdbutil/MinimalTypeDumper.cpp | 20 +- tools/llvm-pdbutil/MinimalTypeDumper.h | 4 +- tools/llvm-pdbutil/PdbYaml.cpp | 35 - tools/llvm-pdbutil/PdbYaml.h | 2 +- tools/llvm-pdbutil/llvm-pdbutil.cpp | 4 +- tools/llvm-readobj/CMakeLists.txt | 2 + tools/llvm-readobj/COFFDumper.cpp | 3 +- tools/llvm-readobj/ELFDumper.cpp | 14 +- tools/llvm-readobj/llvm-readobj.cpp | 22 +- tools/opt-viewer/opt-diff.py | 27 +- tools/opt-viewer/opt-stats.py | 14 +- tools/opt-viewer/opt-viewer.py | 14 +- tools/opt-viewer/optpmap.py | 1 + tools/opt-viewer/optrecord.py | 20 +- unittests/Analysis/CGSCCPassManagerTest.cpp | 2 + unittests/Analysis/LazyCallGraphTest.cpp | 47 +- .../DebugInfo/CodeView/RandomAccessVisitorTest.cpp | 3 +- unittests/DebugInfo/PDB/CMakeLists.txt | 3 +- 
unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp | 183 -- unittests/IR/CFGBuilder.cpp | 269 ++ unittests/IR/CFGBuilder.h | 94 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/DominatorTreeTest.cpp | 300 +- unittests/IR/IRBuilderTest.cpp | 11 +- unittests/IR/MetadataTest.cpp | 24 +- unittests/Support/TargetParserTest.cpp | 5 +- unittests/Support/YAMLIOTest.cpp | 16 + unittests/Support/raw_ostream_test.cpp | 5 + utils/TableGen/CodeGenRegisters.cpp | 12 +- utils/lit/lit/LitConfig.py | 4 +- utils/lit/lit/TestRunner.py | 2 + utils/lit/lit/main.py | 12 +- utils/vim/syntax/llvm.vim | 4 +- 708 files changed, 43451 insertions(+), 10484 deletions(-) create mode 100644 include/llvm/DebugInfo/CodeView/GUID.h delete mode 100644 include/llvm/DebugInfo/CodeView/TypeServerHandler.h delete mode 100644 include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h create mode 100644 include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h delete mode 100644 lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp delete mode 100644 lib/Fuzzer/FuzzerTraceState.cpp create mode 100644 lib/Fuzzer/test/FlagsTest.cpp create mode 100644 lib/Target/AArch64/AArch64FalkorHWPFFix.cpp create mode 100644 lib/Target/SystemZ/SystemZScheduleZ14.td create mode 100644 lib/Target/X86/X86CmovConversion.cpp create mode 100644 lib/Target/X86/X86ScheduleZnver1.td create mode 100644 lib/ToolDrivers/llvm-dlltool/CMakeLists.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp create mode 100644 lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/Options.td create mode 100644 test/Bitcode/upgrade-importedentity.ll create mode 100644 test/Bitcode/upgrade-importedentity.ll.bc create mode 100644 test/CodeGen/AArch64/GlobalISel/select-fma.mir create mode 100644 test/CodeGen/AArch64/aarch64_win64cc_vararg.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.mir create mode 100644 
test/CodeGen/AArch64/falkor-hwpf.ll create mode 100644 test/CodeGen/AArch64/win64_vararg.ll create mode 100644 test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll create mode 100644 test/CodeGen/AMDGPU/move-to-valu-worklist.ll create mode 100644 test/CodeGen/BPF/select_ri.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-explicit-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-function-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-multiple-functions.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-text-section.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-load-v4i16.ll delete mode 100644 test/CodeGen/Hexagon/vect/vect-loadv4i16.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-v4i16.ll create mode 100644 test/CodeGen/MIR/AMDGPU/fold-multiple.mir create mode 100644 test/CodeGen/Mips/long-calls.ll create mode 100644 test/CodeGen/PowerPC/PR33671.ll create mode 100644 test/CodeGen/SPARC/soft-mul-div.ll create mode 100644 test/CodeGen/SystemZ/branch-11.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-03.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-04.ll create mode 100644 test/CodeGen/SystemZ/fp-add-04.ll create mode 100644 test/CodeGen/SystemZ/fp-cmp-06.ll create mode 100644 test/CodeGen/SystemZ/fp-const-11.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-15.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-16.ll create mode 100644 test/CodeGen/SystemZ/fp-copysign-02.ll create mode 100644 test/CodeGen/SystemZ/fp-div-04.ll create mode 100644 test/CodeGen/SystemZ/fp-move-13.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-10.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-11.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-12.ll create mode 100644 test/CodeGen/SystemZ/fp-neg-02.ll create mode 100644 test/CodeGen/SystemZ/fp-round-03.ll create mode 100644 test/CodeGen/SystemZ/fp-sqrt-04.ll create mode 100644 test/CodeGen/SystemZ/fp-sub-04.ll create mode 100644 test/CodeGen/SystemZ/int-add-17.ll create mode 
100644 test/CodeGen/SystemZ/int-mul-09.ll create mode 100644 test/CodeGen/SystemZ/int-mul-10.ll create mode 100644 test/CodeGen/SystemZ/int-mul-11.ll create mode 100644 test/CodeGen/SystemZ/int-sub-10.ll create mode 100644 test/CodeGen/SystemZ/tdc-07.ll create mode 100644 test/CodeGen/SystemZ/vec-abs-06.ll create mode 100644 test/CodeGen/SystemZ/vec-add-02.ll create mode 100644 test/CodeGen/SystemZ/vec-and-04.ll create mode 100644 test/CodeGen/SystemZ/vec-cmp-07.ll create mode 100644 test/CodeGen/SystemZ/vec-ctpop-02.ll create mode 100644 test/CodeGen/SystemZ/vec-div-02.ll create mode 100644 test/CodeGen/SystemZ/vec-intrinsics-01.ll create mode 100644 test/CodeGen/SystemZ/vec-intrinsics-02.ll delete mode 100644 test/CodeGen/SystemZ/vec-intrinsics.ll create mode 100644 test/CodeGen/SystemZ/vec-max-05.ll create mode 100644 test/CodeGen/SystemZ/vec-min-05.ll create mode 100644 test/CodeGen/SystemZ/vec-move-18.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-03.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-04.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-05.ll create mode 100644 test/CodeGen/SystemZ/vec-neg-02.ll create mode 100644 test/CodeGen/SystemZ/vec-or-03.ll create mode 100644 test/CodeGen/SystemZ/vec-round-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sqrt-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sub-02.ll create mode 100644 test/CodeGen/SystemZ/vec-xor-02.ll create mode 100644 test/CodeGen/Thumb/litpoolremat.ll create mode 100644 test/CodeGen/X86/alias-static-alloca.ll create mode 100644 test/CodeGen/X86/avx512-rotate.ll create mode 100644 test/CodeGen/X86/bmi-schedule.ll create mode 100644 test/CodeGen/X86/bmi2-schedule.ll create mode 100644 test/CodeGen/X86/bswap-rotate.ll create mode 100644 test/CodeGen/X86/f16c-schedule.ll create mode 100644 test/CodeGen/X86/lea32-schedule.ll create mode 100644 test/CodeGen/X86/lea64-schedule.ll create mode 100644 test/CodeGen/X86/lzcnt-schedule.ll create mode 100644 
test/CodeGen/X86/memcmp-minsize.ll create mode 100644 test/CodeGen/X86/memcmp-optsize.ll create mode 100644 test/CodeGen/X86/popcnt-schedule.ll create mode 100644 test/CodeGen/X86/pr32282.ll create mode 100644 test/CodeGen/X86/pr32515.ll create mode 100644 test/CodeGen/X86/pr33772.ll create mode 100644 test/CodeGen/X86/pr33828.ll create mode 100644 test/CodeGen/X86/rotate_vec.ll create mode 100644 test/CodeGen/X86/vector-rotate-512.ll create mode 100644 test/CodeGen/X86/x86-cmov-converter.ll create mode 100644 test/DllTool/coff-exports.def create mode 100644 test/DllTool/coff-weak-exports.def create mode 100644 test/DllTool/lit.local.cfg create mode 100644 test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll create mode 100644 test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll create mode 100644 test/MC/AArch64/coff-relocations.s create mode 100644 test/MC/AArch64/invalid-instructions-spellcheck.s create mode 100644 test/MC/Disassembler/SystemZ/insns-z14.txt delete mode 100644 test/MC/Mips/mt/invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-reserved-valid.s create mode 100644 test/MC/SystemZ/insn-bad-z14.s create mode 100644 test/MC/SystemZ/insn-good-z14.s create mode 100644 test/MC/SystemZ/invalid-instructions-spellcheck.s create mode 100644 test/ObjectYAML/CodeView/guid.yaml create mode 100644 test/Other/cgscc-libcall-update.ll create mode 100644 test/Transforms/EarlyCSE/globalsaa-memoryssa.ll create mode 100644 test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll create mode 100644 test/Transforms/GlobalOpt/pr33686.ll create mode 100644 
test/Transforms/IRCE/eq_ne.ll create mode 100644 test/Transforms/IRCE/pre_post_loops.ll create mode 100644 test/Transforms/Inline/AArch64/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/lit.local.cfg create mode 100644 test/Transforms/Inline/X86/ext.ll delete mode 100644 test/Transforms/InstCombine/and-not-or.ll create mode 100644 test/Transforms/InstCombine/element-atomic-memintrins.ll create mode 100644 test/Transforms/InstCombine/pr33765.ll create mode 100644 test/Transforms/LoopInterchange/current-limitations-lcssa.ll create mode 100644 test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll create mode 100644 test/Transforms/LoopInterchange/interchange-not-profitable.ll create mode 100644 test/Transforms/LoopInterchange/interchange-output-dependencies.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-down.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-up.ll delete mode 100644 test/Transforms/LoopInterchange/interchange.ll create mode 100644 test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll create mode 100644 test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll create mode 100644 test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll create mode 100644 test/tools/llvm-dwarfdump/X86/verify_debug_info.s create mode 100644 test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s create mode 100644 test/tools/llvm-mt/help.test create mode 100644 test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 create mode 100644 test/tools/llvm-objdump/AArch64/macho-reloc-addend.test create mode 100644 tools/llvm-mt/CMakeLists.txt create mode 
100644 tools/llvm-mt/LLVMBuild.txt create mode 100644 tools/llvm-mt/Opts.td create mode 100644 tools/llvm-mt/llvm-mt.cpp delete mode 100644 unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp create mode 100644 unittests/IR/CFGBuilder.cpp create mode 100644 unittests/IR/CFGBuilder.h diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT index 7bfa88c6cf0e..9a01c725fb51 100644 --- a/RELEASE_TESTERS.TXT +++ b/RELEASE_TESTERS.TXT @@ -41,14 +41,9 @@ E: hans@chromium.org T: x86 O: Windows -N: Renato Golin -E: renato.golin@linaro.org -T: ARM -O: Linux - N: Diana Picus E: diana.picus@linaro.org -T: AArch64 +T: ARM, AArch64 O: Linux N: Simon Dardis diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index e201333f3007..0a5cb00a48d3 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -132,7 +132,8 @@ The ``MayAlias`` response is used whenever the two pointers might refer to the same object. The ``PartialAlias`` response is used when the two memory objects are known to -be overlapping in some way, but do not start at the same address. +be overlapping in some way, regardless whether they start at the same address +or not. The ``MustAlias`` response may only be returned if the two memory objects are guaranteed to always start at exactly the same location. A ``MustAlias`` diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 722718bf4f16..fa41198755fd 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -34,10 +34,10 @@ There are some conventions that are not uniformly followed in the code base (e.g. the naming convention). This is because they are relatively new, and a lot of code was written before they were put in place. Our long term goal is for the entire codebase to follow the convention, but we explicitly *do not* -want patches that do large-scale reformating of existing code. On the other +want patches that do large-scale reformatting of existing code. 
On the other hand, it is reasonable to rename the methods of a class if you're about to -change it in some other way. Just do the reformating as a separate commit from -the functionality change. +change it in some other way. Just do the reformatting as a separate commit +from the functionality change. The ultimate goal of these guidelines is to increase the readability and maintainability of our common source base. If you have suggestions for topics to diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst index b4d15ef57b73..fbe1a9ab1843 100644 --- a/docs/CommandGuide/lit.rst +++ b/docs/CommandGuide/lit.rst @@ -80,6 +80,13 @@ OUTPUT OPTIONS Show more information on test failures, for example the entire test output instead of just the test result. +.. option:: -vv, --echo-all-commands + + Echo all commands to stdout, as they are being executed. + This can be valuable for debugging test failures, as the last echoed command + will be the one which has failed. + This option implies ``--verbose``. + .. option:: -a, --show-all Show more information about all tests, for example the entire test diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h index 8cae63c3c869..b566aeaf1fd6 100644 --- a/include/llvm/Analysis/DominanceFrontier.h +++ b/include/llvm/Analysis/DominanceFrontier.h @@ -29,9 +29,9 @@ namespace llvm { /// DominanceFrontierBase - Common base class for computing forward and inverse /// dominance frontiers for a function. 
/// -template +template class DominanceFrontierBase { -public: + public: typedef std::set DomSetType; // Dom set for a bb typedef std::map DomSetMapType; // Dom set map @@ -40,10 +40,10 @@ protected: DomSetMapType Frontiers; std::vector Roots; - const bool IsPostDominators; + static constexpr bool IsPostDominators = IsPostDom; -public: - DominanceFrontierBase(bool isPostDom) : IsPostDominators(isPostDom) {} + public: + DominanceFrontierBase() {} /// getRoots - Return the root blocks of the current CFG. This may include /// multiple blocks if we are computing post dominators. For forward @@ -96,7 +96,7 @@ public: /// compare - Return true if the other dominance frontier base matches /// this dominance frontier base. Otherwise return false. - bool compare(DominanceFrontierBase &Other) const; + bool compare(DominanceFrontierBase &Other) const; /// print - Convert to human readable form /// @@ -113,22 +113,21 @@ public: /// used to compute a forward dominator frontiers. /// template -class ForwardDominanceFrontierBase : public DominanceFrontierBase { -private: +class ForwardDominanceFrontierBase + : public DominanceFrontierBase { + private: typedef GraphTraits BlockTraits; public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef typename DominanceFrontierBase::DomSetType DomSetType; - - ForwardDominanceFrontierBase() : DominanceFrontierBase(false) {} - - void analyze(DomTreeT &DT) { - this->Roots = DT.getRoots(); - assert(this->Roots.size() == 1 && - "Only one entry block for forward domfronts!"); - calculate(DT, DT[this->Roots[0]]); + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef typename DominanceFrontierBase::DomSetType DomSetType; + + void analyze(DomTreeT &DT) { + this->Roots = DT.getRoots(); + assert(this->Roots.size() == 1 && + "Only one entry block for forward domfronts!"); + calculate(DT, DT[this->Roots[0]]); } const DomSetType &calculate(const DomTreeT &DT, const DomTreeNodeT *Node); @@ 
-136,15 +135,16 @@ public: class DominanceFrontier : public ForwardDominanceFrontierBase { public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef DominanceFrontierBase::DomSetType DomSetType; - typedef DominanceFrontierBase::iterator iterator; - typedef DominanceFrontierBase::const_iterator const_iterator; - - /// Handle invalidation explicitly. - bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &); + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef DominanceFrontierBase::DomSetType DomSetType; + typedef DominanceFrontierBase::iterator iterator; + typedef DominanceFrontierBase::const_iterator + const_iterator; + + /// Handle invalidation explicitly. + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); }; class DominanceFrontierWrapperPass : public FunctionPass { @@ -168,7 +168,8 @@ public: void dump() const; }; -extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; extern template class ForwardDominanceFrontierBase; /// \brief Analysis pass which computes a \c DominanceFrontier. 
diff --git a/include/llvm/Analysis/DominanceFrontierImpl.h b/include/llvm/Analysis/DominanceFrontierImpl.h index 9f8cacc24f2c..5093b975e709 100644 --- a/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/include/llvm/Analysis/DominanceFrontierImpl.h @@ -39,33 +39,33 @@ public: const DomTreeNodeT *parentNode; }; -template -void DominanceFrontierBase::removeBlock(BlockT *BB) { +template +void DominanceFrontierBase::removeBlock(BlockT *BB) { assert(find(BB) != end() && "Block is not in DominanceFrontier!"); for (iterator I = begin(), E = end(); I != E; ++I) I->second.erase(BB); Frontiers.erase(BB); } -template -void DominanceFrontierBase::addToFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::addToFrontier(iterator I, + BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -void DominanceFrontierBase::removeFromFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::removeFromFrontier( + iterator I, BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, - const DomSetType &DS2) const { +template +bool DominanceFrontierBase::compareDomSet( + DomSetType &DS1, const DomSetType &DS2) const { std::set tmpSet; for (BlockT *BB : DS2) tmpSet.insert(BB); @@ -88,9 +88,9 @@ bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, return false; } -template -bool DominanceFrontierBase::compare( - DominanceFrontierBase &Other) const { +template +bool DominanceFrontierBase::compare( + DominanceFrontierBase &Other) const { DomSetMapType tmpFrontiers; for (typename DomSetMapType::const_iterator I = Other.begin(), E = Other.end(); @@ -118,8 +118,8 @@ bool DominanceFrontierBase::compare( return false; } -template -void 
DominanceFrontierBase::print(raw_ostream &OS) const { +template +void DominanceFrontierBase::print(raw_ostream &OS) const { for (const_iterator I = begin(), E = end(); I != E; ++I) { OS << " DomFrontier for BB "; if (I->first) @@ -142,8 +142,8 @@ void DominanceFrontierBase::print(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -template -void DominanceFrontierBase::dump() const { +template +void DominanceFrontierBase::dump() const { print(dbgs()); } #endif diff --git a/include/llvm/Analysis/IteratedDominanceFrontier.h b/include/llvm/Analysis/IteratedDominanceFrontier.h index bd74d6bd14c3..edaf4e9025bc 100644 --- a/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -42,11 +42,11 @@ namespace llvm { /// By default, liveness is not used to prune the IDF computation. /// The template parameters should be either BasicBlock* or Inverse, depending on if you want the forward or reverse IDF. -template +template class IDFCalculator { - -public: - IDFCalculator(DominatorTreeBase &DT) : DT(DT), useLiveIn(false) {} + public: + IDFCalculator(DominatorTreeBase &DT) + : DT(DT), useLiveIn(false) {} /// \brief Give the IDF calculator the set of blocks in which the value is /// defined. 
This is equivalent to the set of starting blocks it should be @@ -84,12 +84,12 @@ public: void calculate(SmallVectorImpl &IDFBlocks); private: - DominatorTreeBase &DT; - bool useLiveIn; - const SmallPtrSetImpl *LiveInBlocks; - const SmallPtrSetImpl *DefBlocks; + DominatorTreeBase &DT; + bool useLiveIn; + const SmallPtrSetImpl *LiveInBlocks; + const SmallPtrSetImpl *DefBlocks; }; -typedef IDFCalculator ForwardIDFCalculator; -typedef IDFCalculator> ReverseIDFCalculator; +typedef IDFCalculator ForwardIDFCalculator; +typedef IDFCalculator, true> ReverseIDFCalculator; } #endif diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 3a052761ad7d..a025f2275fb4 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -43,6 +43,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -908,7 +909,7 @@ public: /// This sets up the graph and computes all of the entry points of the graph. /// No function definitions are scanned until their nodes in the graph are /// requested during traversal. - LazyCallGraph(Module &M); + LazyCallGraph(Module &M, TargetLibraryInfo &TLI); LazyCallGraph(LazyCallGraph &&G); LazyCallGraph &operator=(LazyCallGraph &&RHS); @@ -966,6 +967,22 @@ public: return insertInto(F, N); } + /// Get the sequence of known and defined library functions. + /// + /// These functions, because they are known to LLVM, can have calls + /// introduced out of thin air from arbitrary IR. + ArrayRef getLibFunctions() const { + return LibFunctions.getArrayRef(); + } + + /// Test whether a function is a known and defined library function tracked by + /// the call graph. 
+ /// + /// Because these functions are known to LLVM they are specially modeled in + /// the call graph and even when all IR-level references have been removed + /// remain active and reachable. + bool isLibFunction(Function &F) const { return LibFunctions.count(&F); } + ///@{ /// \name Pre-SCC Mutation API /// @@ -1100,6 +1117,11 @@ private: /// These are all of the RefSCCs which have no children. SmallVector LeafRefSCCs; + /// Defined functions that are also known library functions which the + /// optimizer can reason about and therefore might introduce calls to out of + /// thin air. + SmallSetVector LibFunctions; + /// Helper to insert a new function, with an already looked-up entry in /// the NodeMap. Node &insertInto(Function &F, Node *&MappedN); @@ -1216,8 +1238,8 @@ public: /// /// This just builds the set of entry points to the call graph. The rest is /// built lazily as it is walked. - LazyCallGraph run(Module &M, ModuleAnalysisManager &) { - return LazyCallGraph(M); + LazyCallGraph run(Module &M, ModuleAnalysisManager &AM) { + return LazyCallGraph(M, AM.getResult(M)); } }; diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 096df1e421a7..70ce9a870517 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -56,7 +56,8 @@ class Loop; class MDNode; class PHINode; class raw_ostream; -template class DominatorTreeBase; +template +class DominatorTreeBase; template class LoopInfoBase; template class LoopBase; @@ -663,12 +664,12 @@ public: } /// Create the loop forest using a stable algorithm. 
- void analyze(const DominatorTreeBase &DomTree); + void analyze(const DominatorTreeBase &DomTree); // Debugging void print(raw_ostream &OS) const; - void verify(const DominatorTreeBase &DomTree) const; + void verify(const DominatorTreeBase &DomTree) const; }; // Implementation in LoopInfoImpl.h @@ -683,7 +684,7 @@ class LoopInfo : public LoopInfoBase { LoopInfo(const LoopInfo &) = delete; public: LoopInfo() {} - explicit LoopInfo(const DominatorTreeBase &DomTree); + explicit LoopInfo(const DominatorTreeBase &DomTree); LoopInfo(LoopInfo &&Arg) : BaseT(std::move(static_cast(Arg))) {} LoopInfo &operator=(LoopInfo &&RHS) { diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 372fc8b21745..e9177e68ed77 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -340,10 +340,10 @@ void LoopBase::print(raw_ostream &OS, unsigned Depth, /// Discover a subloop with the specified backedges such that: All blocks within /// this loop are mapped to this loop or a subloop. And all subloops within this /// loop have their parent loop set to this loop or a subloop. -template -static void discoverAndMapSubloop(LoopT *L, ArrayRef Backedges, - LoopInfoBase *LI, - const DominatorTreeBase &DomTree) { +template +static void discoverAndMapSubloop( + LoopT *L, ArrayRef Backedges, LoopInfoBase *LI, + const DomTreeBase &DomTree) { typedef GraphTraits > InvBlockTraits; unsigned NumBlocks = 0; @@ -462,10 +462,9 @@ void PopulateLoopsDFS::insertIntoLoop(BlockT *Block) { /// /// The Block vectors are inclusive, so step 3 requires loop-depth number of /// insertions per block. -template -void LoopInfoBase:: -analyze(const DominatorTreeBase &DomTree) { - +template +void LoopInfoBase::analyze( + const DomTreeBase &DomTree) { // Postorder traversal of the dominator tree. 
const DomTreeNodeBase *DomRoot = DomTree.getRootNode(); for (auto DomNode : post_order(DomRoot)) { @@ -607,7 +606,7 @@ static void compareLoops(const LoopT *L, const LoopT *OtherL, template void LoopInfoBase::verify( - const DominatorTreeBase &DomTree) const { + const DomTreeBase &DomTree) const { DenseSet Loops; for (iterator I = begin(), E = end(); I != E; ++I) { assert(!(*I)->getParentLoop() && "Top-level loop has a parent!"); diff --git a/include/llvm/Analysis/PostDominators.h b/include/llvm/Analysis/PostDominators.h index 94ee3b03bb86..17f2e8eaf4a2 100644 --- a/include/llvm/Analysis/PostDominators.h +++ b/include/llvm/Analysis/PostDominators.h @@ -22,10 +22,8 @@ namespace llvm { /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used to /// compute the post-dominator tree. /// -struct PostDominatorTree : public DominatorTreeBase { - typedef DominatorTreeBase Base; - - PostDominatorTree() : DominatorTreeBase(true) {} +struct PostDominatorTree : public PostDomTreeBase { + typedef PostDomTreeBase Base; /// Handle invalidation explicitly. bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index c7accfae78b0..d1b182755cf8 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -237,17 +237,15 @@ struct FoldingSetTrait : DefaultFoldingSetTrait { }; /// This class represents an assumption that two SCEV expressions are equal, -/// and this can be checked at run-time. We assume that the left hand side is -/// a SCEVUnknown and the right hand side a constant. +/// and this can be checked at run-time. class SCEVEqualPredicate final : public SCEVPredicate { - /// We assume that LHS == RHS, where LHS is a SCEVUnknown and RHS a - /// constant. - const SCEVUnknown *LHS; - const SCEVConstant *RHS; + /// We assume that LHS == RHS. 
+ const SCEV *LHS; + const SCEV *RHS; public: - SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, - const SCEVConstant *RHS); + SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEV *LHS, + const SCEV *RHS); /// Implementation of the SCEVPredicate interface bool implies(const SCEVPredicate *N) const override; @@ -256,10 +254,10 @@ public: const SCEV *getExpr() const override; /// Returns the left hand side of the equality. - const SCEVUnknown *getLHS() const { return LHS; } + const SCEV *getLHS() const { return LHS; } /// Returns the right hand side of the equality. - const SCEVConstant *getRHS() const { return RHS; } + const SCEV *getRHS() const { return RHS; } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const SCEVPredicate *P) { @@ -1241,6 +1239,14 @@ public: SmallVector NewOp(Operands.begin(), Operands.end()); return getAddRecExpr(NewOp, L, Flags); } + + /// Checks if \p SymbolicPHI can be rewritten as an AddRecExpr under some + /// Predicates. If successful return these ; + /// The function is intended to be called from PSCEV (the caller will decide + /// whether to actually add the predicates and carry out the rewrites). + Optional>> + createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI); + /// Returns an expression for a GEP /// /// \p GEP The GEP. The indices contained in the GEP itself are ignored, @@ -1675,8 +1681,7 @@ public: return F.getParent()->getDataLayout(); } - const SCEVPredicate *getEqualPredicate(const SCEVUnknown *LHS, - const SCEVConstant *RHS); + const SCEVPredicate *getEqualPredicate(const SCEV *LHS, const SCEV *RHS); const SCEVPredicate * getWrapPredicate(const SCEVAddRecExpr *AR, @@ -1692,6 +1697,19 @@ public: SmallPtrSetImpl &Preds); private: + /// Similar to createAddRecFromPHI, but with the additional flexibility of + /// suggesting runtime overflow checks in case casts are encountered. 
+ /// If successful, the analysis records that for this loop, \p SymbolicPHI, + /// which is the UnknownSCEV currently representing the PHI, can be rewritten + /// into an AddRec, assuming some predicates; The function then returns the + /// AddRec and the predicates as a pair, and caches this pair in + /// PredicatedSCEVRewrites. + /// If the analysis is not successful, a mapping from the \p SymbolicPHI to + /// itself (with no predicates) is recorded, and a nullptr with an empty + /// predicates vector is returned as a pair. + Optional>> + createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI); + /// Compute the backedge taken count knowing the interval difference, the /// stride and presence of the equality in the comparison. const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride, @@ -1722,6 +1740,12 @@ private: FoldingSet UniquePreds; BumpPtrAllocator SCEVAllocator; + /// Cache tentative mappings from UnknownSCEVs in a Loop, to a SCEV expression + /// they can be rewritten into under certain predicates. + DenseMap, + std::pair>> + PredicatedSCEVRewrites; + /// The head of a linked list of all SCEVUnknown values that have been /// allocated. This is used by releaseMemory to locate them all and call /// their destructors. diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index dfb525e3de7a..24edd3826a2e 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -155,6 +155,13 @@ public: int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) const; + /// \brief Estimate the cost of a EXT operation when lowered. + /// + /// The contract for this function is the same as \c getOperationCost except + /// that it supports an interface that provides extra information specific to + /// the EXT operation. + int getExtCost(const Instruction *I, const Value *Src) const; + /// \brief Estimate the cost of a function call when lowered. 
/// /// The contract for this is the same as \c getOperationCost except that it @@ -849,6 +856,7 @@ public: virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0; virtual int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) = 0; + virtual int getExtCost(const Instruction *I, const Value *Src) = 0; virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0; virtual int getCallCost(const Function *F, int NumArgs) = 0; virtual int getCallCost(const Function *F, @@ -1022,6 +1030,9 @@ public: ArrayRef Operands) override { return Impl.getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) override { + return Impl.getExtCost(I, Src); + } int getCallCost(FunctionType *FTy, int NumArgs) override { return Impl.getCallCost(FTy, NumArgs); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 8740ee92eed5..0b07fe9aa232 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -120,6 +120,10 @@ public: return SI.getNumCases(); } + int getExtCost(const Instruction *I, const Value *Src) { + return TTI::TCC_Basic; + } + unsigned getCallCost(FunctionType *FTy, int NumArgs) { assert(FTy && "FunctionType must be provided to this routine."); @@ -728,6 +732,8 @@ public: // nop on most sane targets. 
if (isa(CI->getOperand(0))) return TTI::TCC_Free; + if (isa(CI) || isa(CI) || isa(CI)) + return static_cast(this)->getExtCost(CI, Operands.back()); } return static_cast(this)->getOperationCost( diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index b59fd60e8aed..633107024792 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -155,6 +155,18 @@ public: return BaseT::getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) { + if (getTLI()->isExtFree(I)) + return TargetTransformInfo::TCC_Free; + + if (isa(I) || isa(I)) + if (const LoadInst *LI = dyn_cast(Src)) + if (getTLI()->isExtLoad(LI, I, DL)) + return TargetTransformInfo::TCC_Free; + + return TargetTransformInfo::TCC_Basic; + } + unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Arguments) { return BaseT::getIntrinsicCost(IID, RetTy, Arguments); diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h index 370ffbe4862e..6efeefd9a721 100644 --- a/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -23,27 +23,24 @@ class MachineDominanceFrontier : public MachineFunctionPass { ForwardDominanceFrontierBase Base; public: - using DomTreeT = DominatorTreeBase; - using DomTreeNodeT = DomTreeNodeBase; - using DomSetType = DominanceFrontierBase::DomSetType; - using iterator = DominanceFrontierBase::iterator; - using const_iterator = - DominanceFrontierBase::const_iterator; + using DomTreeT = DomTreeBase; + using DomTreeNodeT = DomTreeNodeBase; + using DomSetType = DominanceFrontierBase::DomSetType; + using iterator = DominanceFrontierBase::iterator; + using const_iterator = + DominanceFrontierBase::const_iterator; - MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; - MachineDominanceFrontier & - operator=(const MachineDominanceFrontier &) = delete; + 
MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier &operator=(const MachineDominanceFrontier &) = delete; - static char ID; + static char ID; - MachineDominanceFrontier(); + MachineDominanceFrontier(); - DominanceFrontierBase &getBase() { - return Base; - } + DominanceFrontierBase &getBase() { return Base; } - inline const std::vector &getRoots() const { - return Base.getRoots(); + inline const std::vector &getRoots() const { + return Base.getRoots(); } MachineBasicBlock *getRoot() const { @@ -98,7 +95,7 @@ public: return Base.compareDomSet(DS1, DS2); } - bool compare(DominanceFrontierBase &Other) const { + bool compare(DominanceFrontierBase &Other) const { return Base.compare(Other); } diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 74a7c3ea04ae..8bf98f606495 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -28,13 +28,15 @@ namespace llvm { -template<> -inline void DominatorTreeBase::addRoot(MachineBasicBlock* MBB) { +template <> +inline void DominatorTreeBase::addRoot( + MachineBasicBlock *MBB) { this->Roots.push_back(MBB); } extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree using MachineDomTreeNode = DomTreeNodeBase; @@ -65,7 +67,7 @@ class MachineDominatorTree : public MachineFunctionPass { mutable SmallSet NewBBs; /// The DominatorTreeBase that is used to compute a normal dominator tree - std::unique_ptr> DT; + std::unique_ptr> DT; /// \brief Apply all the recorded critical edges to the DT. 
/// This updates the underlying DT information in a way that uses @@ -79,9 +81,8 @@ public: MachineDominatorTree(); - DominatorTreeBase &getBase() { - if (!DT) - DT.reset(new DominatorTreeBase(false)); + DomTreeBase &getBase() { + if (!DT) DT.reset(new DomTreeBase()); applySplitCriticalEdges(); return *DT; } diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h index 70bdb191ad34..d29d2d85cb0a 100644 --- a/include/llvm/CodeGen/MachinePostDominators.h +++ b/include/llvm/CodeGen/MachinePostDominators.h @@ -26,7 +26,7 @@ namespace llvm { /// struct MachinePostDominatorTree : public MachineFunctionPass { private: - DominatorTreeBase *DT; + PostDomTreeBase *DT; public: static char ID; diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 70ccc867cd38..df55e181364c 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -17,7 +17,6 @@ namespace llvm { namespace codeview { class TypeCollection; -class TypeServerHandler; class TypeVisitorCallbacks; enum VisitorDataSource { @@ -31,11 +30,9 @@ enum VisitorDataSource { Error visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitMemberRecord(CVMemberRecord Record, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source = VDS_BytesPresent); @@ -46,12 +43,9 @@ Error visitMemberRecordStream(ArrayRef FieldList, TypeVisitorCallbacks &Callbacks); Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler 
*TS = nullptr); -Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); -Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); +Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks); +Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index db944c7057f7..94f104ff772c 100644 --- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -84,7 +84,7 @@ public: Error mapEncodedInteger(uint64_t &Value); Error mapEncodedInteger(APSInt &Value); Error mapStringZ(StringRef &Value); - Error mapGuid(StringRef &Guid); + Error mapGuid(GUID &Guid); Error mapStringZVectorZ(std::vector &Value); diff --git a/include/llvm/DebugInfo/CodeView/Formatters.h b/include/llvm/DebugInfo/CodeView/Formatters.h index 0842c1e373db..278ad02a39cd 100644 --- a/include/llvm/DebugInfo/CodeView/Formatters.h +++ b/include/llvm/DebugInfo/CodeView/Formatters.h @@ -12,6 +12,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" @@ -31,7 +32,7 @@ public: explicit GuidAdapter(ArrayRef Guid); explicit GuidAdapter(StringRef Guid); - void format(raw_ostream &Stream, StringRef Style) override ; + void format(raw_ostream &Stream, StringRef Style) override; }; } // end namespace detail @@ -60,6 +61,13 @@ public: } }; +template <> struct format_provider { + static void format(const codeview::GUID &V, llvm::raw_ostream &Stream, + StringRef Style) { + Stream << V; + } +}; + } // end namespace llvm #endif // 
LLVM_DEBUGINFO_CODEVIEW_FORMATTERS_H diff --git a/include/llvm/DebugInfo/CodeView/GUID.h b/include/llvm/DebugInfo/CodeView/GUID.h new file mode 100644 index 000000000000..a055ce9e2e45 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/GUID.h @@ -0,0 +1,55 @@ +//===- GUID.h ---------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_GUID_H +#define LLVM_DEBUGINFO_CODEVIEW_GUID_H + +#include +#include + +namespace llvm { +class raw_ostream; + +namespace codeview { + +/// This represents the 'GUID' type from windows.h. +struct GUID { + uint8_t Guid[16]; +}; + +inline bool operator==(const GUID &LHS, const GUID &RHS) { + return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); +} + +inline bool operator<(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) < 0; +} + +inline bool operator<=(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) <= 0; +} + +inline bool operator>(const GUID &LHS, const GUID &RHS) { + return !(LHS <= RHS); +} + +inline bool operator>=(const GUID &LHS, const GUID &RHS) { + return !(LHS < RHS); +} + +inline bool operator!=(const GUID &LHS, const GUID &RHS) { + return !(LHS == RHS); +} + +raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid); + +} // namespace codeview +} // namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index cdfc1745cea5..f3086cf3dbb9 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -848,7 +848,7 @@ public: : SymbolRecord(SymbolRecordKind::BuildInfoSym), RecordOffset(RecordOffset) {} - uint32_t BuildId; + 
TypeIndex BuildId; uint32_t RecordOffset; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 2efeb1b3cefd..7942c0c0bc21 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -18,6 +18,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Endian.h" @@ -539,15 +540,17 @@ class TypeServer2Record : public TypeRecord { public: TypeServer2Record() = default; explicit TypeServer2Record(TypeRecordKind Kind) : TypeRecord(Kind) {} - TypeServer2Record(StringRef Guid, uint32_t Age, StringRef Name) - : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age), - Name(Name) {} + TypeServer2Record(StringRef GuidStr, uint32_t Age, StringRef Name) + : TypeRecord(TypeRecordKind::TypeServer2), Age(Age), Name(Name) { + assert(GuidStr.size() == 16 && "guid isn't 16 bytes"); + ::memcpy(Guid.Guid, GuidStr.data(), 16); + } - StringRef getGuid() const { return Guid; } + const GUID &getGuid() const { return Guid; } uint32_t getAge() const { return Age; } StringRef getName() const { return Name; } - StringRef Guid; + GUID Guid; uint32_t Age; StringRef Name; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h b/include/llvm/DebugInfo/CodeView/TypeServerHandler.h deleted file mode 100644 index e96baad9ceae..000000000000 --- a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h +++ /dev/null @@ -1,38 +0,0 @@ -//===- TypeServerHandler.h --------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H -#define LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H - -#include "llvm/Support/Error.h" - -namespace llvm { -namespace codeview { - -class TypeServer2Record; -class TypeVisitorCallbacks; - -class TypeServerHandler { -public: - virtual ~TypeServerHandler() = default; - - /// Handle a TypeServer record. If the implementation returns true - /// the record will not be processed by the top-level visitor. If - /// it returns false, it will be processed. If it returns an Error, - /// then the top-level visitor will fail. - virtual Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - return false; - } -}; - -} // end namespace codeview -} // end namespace llvm - -#endif // LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 3ad2b4e9c92f..d78fab47db66 100644 --- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -19,7 +19,6 @@ namespace llvm { namespace codeview { class TypeIndex; -class TypeServerHandler; class TypeTableBuilder; /// \brief Merge one set of type records into another. This method assumes @@ -31,16 +30,13 @@ class TypeTableBuilder; /// type stream, that contains the index of the corresponding type record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param Types The collection of types to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an /// appropriate error code. Error mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, const CVTypeArray &Types); + const CVTypeArray &Types); /// \brief Merge one set of id records into another. 
This method assumes /// that all records are id records, and there are no Type records present. @@ -65,7 +61,7 @@ Error mergeTypeRecords(TypeTableBuilder &Dest, /// appropriate error code. Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, SmallVectorImpl &SourceToDest, - const CVTypeArray &Ids); + const CVTypeArray &Ids); /// \brief Merge a unified set of type and id records, splitting them into /// separate output streams. @@ -78,9 +74,6 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, /// id stream, that contains the index of the corresponding id record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param IdsAndTypes The collection of id records to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an @@ -88,8 +81,7 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, Error mergeTypeAndIdRecords(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index ea36ab7ab5b6..056c1b77c65d 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -238,6 +238,34 @@ public: uint8_t getUnitType() const { return UnitType; } + static bool isValidUnitType(uint8_t UnitType) { + return UnitType == dwarf::DW_UT_compile || UnitType == dwarf::DW_UT_type || + UnitType == dwarf::DW_UT_partial || + UnitType == dwarf::DW_UT_skeleton || + UnitType == dwarf::DW_UT_split_compile || + UnitType == dwarf::DW_UT_split_type; + } + + /// \brief Return the number of bytes for the header of a unit of + /// UnitType type. 
+ /// + /// This function must be called with a valid unit type which in + /// DWARF5 is defined as one of the following six types. + static uint32_t getDWARF5HeaderSize(uint8_t UnitType) { + switch (UnitType) { + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + return 12; + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + return 20; + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: + return 24; + } + llvm_unreachable("Invalid UnitType."); + } + uint64_t getBaseAddress() const { return BaseAddr; } void setBaseAddress(uint64_t base_addr) { diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 9eb5c45faba8..c0291a83ed97 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -21,6 +21,7 @@ class DWARFContext; class DWARFDie; class DWARFUnit; class DWARFAcceleratorTable; +class DWARFDataExtractor; /// A class that verifies DWARF debug information given a DWARF Context. class DWARFVerifier { @@ -30,10 +31,35 @@ class DWARFVerifier { /// can verify each reference points to a valid DIE and not an offset that /// lies between to valid DIEs. std::map> ReferenceToDIEOffsets; - uint32_t NumDebugInfoErrors = 0; uint32_t NumDebugLineErrors = 0; uint32_t NumAppleNamesErrors = 0; + /// Verifies the header of a unit in the .debug_info section. + /// + /// This function currently checks for: + /// - Unit is in 32-bit DWARF format. The function can be modified to + /// support 64-bit format. + /// - The DWARF version is valid + /// - The unit type is valid (if unit is in version >=5) + /// - The unit doesn't extend beyond .debug_info section + /// - The address size is valid + /// - The offset in the .debug_abbrev section is valid + /// + /// \param DebugInfoData The .debug_info section data + /// \param Offset A reference to the offset start of the unit. 
The offset will + /// be updated to point to the next unit in .debug_info + /// \param UnitIndex The index of the unit to be verified + /// \param UnitType A reference to the type of the unit + /// \param isUnitDWARF64 A reference to a flag that shows whether the unit is + /// in 64-bit format. + /// + /// \returns true if the header is verified successfully, false otherwise. + bool verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, uint8_t &UnitType, + bool &isUnitDWARF64); + + + bool verifyUnitContents(DWARFUnit Unit); /// Verifies the attribute's DWARF attribute and its value. /// /// This function currently checks for: @@ -42,7 +68,11 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occured during verification of + /// attributes' values in a .debug_info section unit + unsigned verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue); /// Verifies the attribute's DWARF form. /// @@ -53,7 +83,10 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occured during verification of + /// attributes' forms in a .debug_info section unit + unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); /// Verifies the all valid references that were found when iterating through /// all of the DIE attributes. @@ -62,7 +95,10 @@ class DWARFVerifier { /// offset matches. This helps to ensure if a DWARF link phase moved things /// around, that it doesn't create invalid references by failing to relocate /// CU relative and absolute references. 
- void verifyDebugInfoReferences(); + /// + /// \returns NumErrors The number of errors occured during verification of + /// references for the .debug_info section + unsigned verifyDebugInfoReferences(); /// Verify the the DW_AT_stmt_list encoding and value and ensure that no /// compile units that have the same DW_AT_stmt_list value. diff --git a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h index 3710eb29e7f9..d37b48540ffa 100644 --- a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h +++ b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h @@ -106,7 +106,7 @@ public: getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index 466cb455651b..03205a986f1a 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -19,6 +19,7 @@ namespace pdb { enum class generic_error_code { invalid_path = 1, dia_sdk_not_present, + type_server_not_found, unspecified, }; diff --git a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h index fab086c62c72..eefc36518728 100644 --- a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h @@ -118,7 +118,7 @@ public: virtual uint32_t getVirtualTableShapeId() const = 0; virtual PDB_DataKind getDataKind() const = 0; virtual PDB_SymType getSymTag() const = 0; - virtual PDB_UniqueId getGuid() const = 0; + virtual codeview::GUID getGuid() const = 0; virtual int32_t getOffset() const = 0; virtual int32_t getThisAdjust() const = 0; virtual int32_t getVirtualBasePointerOffset() const = 0; diff --git 
a/include/llvm/DebugInfo/PDB/Native/Formatters.h b/include/llvm/DebugInfo/PDB/Native/Formatters.h index 183f0ad8307e..7d5eab2e2a09 100644 --- a/include/llvm/DebugInfo/PDB/Native/Formatters.h +++ b/include/llvm/DebugInfo/PDB/Native/Formatters.h @@ -23,13 +23,6 @@ break; namespace llvm { -template <> struct format_provider { - static void format(const pdb::PDB_UniqueId &V, llvm::raw_ostream &Stream, - StringRef Style) { - codeview::fmt_guid(V.Guid).format(Stream, Style); - } -}; - template <> struct format_provider { static void format(const pdb::PdbRaw_ImplVer &V, llvm::raw_ostream &Stream, StringRef Style) { diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/include/llvm/DebugInfo/PDB/Native/InfoStream.h index 37bf5f3b573c..fb8271cb5ebc 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStream.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStream.h @@ -12,6 +12,7 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringMap.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" @@ -39,7 +40,7 @@ public: PdbRaw_ImplVer getVersion() const; uint32_t getSignature() const; uint32_t getAge() const; - PDB_UniqueId getGuid() const; + codeview::GUID getGuid() const; uint32_t getNamedStreamMapByteSize() const; PdbRaw_Features getFeatures() const; @@ -71,7 +72,7 @@ private: // Due to the aforementioned limitations with `Signature`, this is a new // signature present on VC70 and higher PDBs which is guaranteed to be // universally unique. 
- PDB_UniqueId Guid; + codeview::GUID Guid; BinarySubstreamRef SubNamedStreams; diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h index 90c28a90d252..c6cb0e221e70 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h @@ -37,7 +37,7 @@ public: void setVersion(PdbRaw_ImplVer V); void setSignature(uint32_t S); void setAge(uint32_t A); - void setGuid(PDB_UniqueId G); + void setGuid(codeview::GUID G); void addFeature(PdbRaw_FeatureSig Sig); uint32_t finalize(); @@ -54,7 +54,7 @@ private: PdbRaw_ImplVer Ver; uint32_t Sig; uint32_t Age; - PDB_UniqueId Guid; + codeview::GUID Guid; NamedStreamMap &NamedStreams; }; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h index ddb7f811da38..587c7ff2b092 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h @@ -27,7 +27,7 @@ public: uint32_t getAge() const override; std::string getSymbolsFileName() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; bool hasCTypes() const override; bool hasPrivateSymbols() const override; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h index 66a9eae28e23..2c6548dcce21 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h @@ -111,7 +111,7 @@ public: getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git 
a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h deleted file mode 100644 index 196ba4d6ffbd..000000000000 --- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- PDBTypeServerHandler.h -----------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H -#define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H - -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/PDBTypes.h" - -#include -#include - -namespace llvm { -namespace pdb { -class NativeSession; - -class PDBTypeServerHandler : public codeview::TypeServerHandler { -public: - PDBTypeServerHandler(bool RevisitAlways = false); - - void addSearchPath(StringRef Path); - Expected handle(codeview::TypeServer2Record &TS, - codeview::TypeVisitorCallbacks &Callbacks) override; - -private: - Expected handleInternal(PDBFile &File, - codeview::TypeVisitorCallbacks &Callbacks); - - bool RevisitAlways; - std::unique_ptr Session; - StringSet<> SearchPaths; -}; -} -} - -#endif diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h index a3cdd3f09a44..b6321cbf45a8 100644 --- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h +++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h @@ -10,6 +10,7 @@ #ifndef LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H #define LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include 
"llvm/Support/Endian.h" @@ -268,17 +269,6 @@ struct PublicsStreamHeader { support::ulittle32_t NumSections; }; -/// Defines a 128-bit unique identifier. This maps to a GUID on Windows, but -/// is abstracted here for the purposes of non-Windows platforms that don't have -/// the GUID structure defined. -struct PDB_UniqueId { - uint8_t Guid[16]; -}; - -inline bool operator==(const PDB_UniqueId &LHS, const PDB_UniqueId &RHS) { - return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); -} - // The header preceeding the global TPI stream. // This corresponds to `HDR` in PDB/dbi/tpi.h. struct TpiStreamHeader { @@ -312,7 +302,7 @@ struct InfoStreamHeader { support::ulittle32_t Version; support::ulittle32_t Signature; support::ulittle32_t Age; - PDB_UniqueId Guid; + codeview::GUID Guid; }; /// The header preceeding the /names stream. diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h index 156abb59a6be..c1edec7a26fe 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h @@ -10,84 +10,13 @@ #ifndef LLVM_DEBUGINFO_PDB_TPIHASHING_H #define LLVM_DEBUGINFO_PDB_TPIHASHING_H -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" -#include -#include namespace llvm { namespace pdb { -class TpiHashUpdater : public codeview::TypeVisitorCallbacks { -public: - TpiHashUpdater() = default; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - virtual Error visitKnownRecord(codeview::CVType &CVR, \ - codeview::Name##Record &Record) override { \ - visitKnownRecordImpl(CVR, Record); \ - return Error::success(); \ - } -#define TYPE_RECORD_ALIAS(EnumName, 
EnumVal, Name, AliasName) -#define MEMBER_RECORD(EnumName, EnumVal, Name) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" - -private: - template - void visitKnownRecordImpl(codeview::CVType &CVR, RecordKind &Record) { - CVR.Hash = 0; - } - - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::ClassRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::EnumRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::UnionRecord &Rec); -}; - -class TpiHashVerifier : public codeview::TypeVisitorCallbacks { -public: - TpiHashVerifier(FixedStreamArray &HashValues, - uint32_t NumHashBuckets) - : HashValues(HashValues), NumHashBuckets(NumHashBuckets) {} - - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::ClassRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::EnumRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UnionRecord &Rec) override; - Error visitTypeBegin(codeview::CVType &CVR) override; - -private: - Error verifySourceLine(codeview::TypeIndex TI); - - Error errorInvalidHash() { - return make_error( - raw_error_code::invalid_tpi_hash, - "Type index is 0x" + - utohexstr(codeview::TypeIndex::FirstNonSimpleIndex + Index)); - } - - FixedStreamArray HashValues; - codeview::CVType RawRecord; - uint32_t NumHashBuckets; - uint32_t Index = -1; -}; +Expected hashTypeRecord(const llvm::codeview::CVType &Type); } // end namespace pdb } // end namespace llvm diff --git a/include/llvm/DebugInfo/PDB/PDBExtras.h 
b/include/llvm/DebugInfo/PDB/PDBExtras.h index 3a38f21b94c8..778121c8eb79 100644 --- a/include/llvm/DebugInfo/PDB/PDBExtras.h +++ b/include/llvm/DebugInfo/PDB/PDBExtras.h @@ -32,7 +32,6 @@ raw_ostream &operator<<(raw_ostream &OS, const PDB_Checksum &Checksum); raw_ostream &operator<<(raw_ostream &OS, const PDB_Lang &Lang); raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag); raw_ostream &operator<<(raw_ostream &OS, const PDB_MemberAccess &Access); -raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Guid); raw_ostream &operator<<(raw_ostream &OS, const PDB_UdtType &Type); raw_ostream &operator<<(raw_ostream &OS, const PDB_Machine &Machine); diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index c1acca386820..27b5457fc8ff 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -22,6 +22,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" @@ -289,21 +290,21 @@ public: // FIXME: We should track and free associated resources (unused compile // callbacks, uncompiled IR, and no-longer-needed/reachable function // implementations). - // FIXME: Return Error once the JIT APIs are Errorized. 
- bool updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { + Error updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { //Find out which logical dylib contains our symbol auto LDI = LogicalDylibs.begin(); for (auto LDE = LogicalDylibs.end(); LDI != LDE; ++LDI) { - if (auto LMResources = LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { + if (auto LMResources = + LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { Module &SrcM = LMResources->SourceModule->getResource(); std::string CalledFnName = mangle(FuncName, SrcM.getDataLayout()); - if (auto EC = LMResources->StubsMgr->updatePointer(CalledFnName, FnBodyAddr)) - return false; - else - return true; + if (auto Err = LMResources->StubsMgr->updatePointer(CalledFnName, + FnBodyAddr)) + return Err; + return Error::success(); } } - return false; + return make_error(FuncName); } private: @@ -363,11 +364,8 @@ private: }); } - auto EC = LD.StubsMgr->createStubs(StubInits); - (void)EC; - // FIXME: This should be propagated back to the user. Stub creation may - // fail for remote JITs. - assert(!EC && "Error generating stubs"); + if (auto Err = LD.StubsMgr->createStubs(StubInits)) + return Err; } // If this module doesn't contain any globals, aliases, or module flags then diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index a9778514b9f1..0c1862c5c3ea 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -135,12 +135,13 @@ public: virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); -private: +protected: struct EHFrame { uint8_t *Addr; size_t Size; }; - std::vector EHFrames; + typedef std::vector EHFrameInfos; + EHFrameInfos EHFrames; }; // Create wrappers for C Binding types (see CBindingWrapping.h). 
diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 801e88aba4d1..850964afc307 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -143,11 +143,15 @@ namespace CallingConv { /// System V ABI, used on most non-Windows systems. X86_64_SysV = 78, - /// \brief The C convention as implemented on Windows/x86-64. This - /// convention differs from the more common \c X86_64_SysV convention - /// in a number of ways, most notably in that XMM registers used to pass - /// arguments are shadowed by GPRs, and vice versa. - X86_64_Win64 = 79, + /// \brief The C convention as implemented on Windows/x86-64 and + /// AArch64. This convention differs from the more common + /// \c X86_64_SysV convention in a number of ways, most notably in + /// that XMM registers used to pass arguments are shadowed by GPRs, + /// and vice versa. + /// On AArch64, this is identical to the normal C (AAPCS) calling + /// convention for normal functions, but floats are passed in integer + /// registers to variadic functions. + Win64 = 79, /// \brief MSVC calling convention that passes vectors and vector aggregates /// in SSE registers. diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index 2e72c41ccee3..0094fd54992a 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -598,6 +598,10 @@ public: /// specified element in the low bits of a uint64_t. uint64_t getElementAsInteger(unsigned i) const; + /// If this is a sequential container of integers (of any size), return the + /// specified element as an APInt. + APInt getElementAsAPInt(unsigned i) const; + /// If this is a sequential container of floating point type, return the /// specified element as an APFloat. APFloat getElementAsAPFloat(unsigned i) const; @@ -761,6 +765,10 @@ public: /// i32/i64/float/double) and must be a ConstantFP or ConstantInt. 
static Constant *getSplat(unsigned NumElts, Constant *Elt); + /// Returns true if this is a splat constant, meaning that all elements have + /// the same value. + bool isSplat() const; + /// If this is a splat constant, meaning that all of the elements have the /// same value, return that value. Otherwise return NULL. Constant *getSplatValue() const; diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 8e6bb4baccaf..6a14f783005d 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -674,32 +674,37 @@ namespace llvm { /// Create a descriptor for an imported module. /// \param Context The scope this module is imported into - /// \param NS The namespace being imported here - /// \param Line Line number + /// \param NS The namespace being imported here. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, DINamespace *NS, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param NS An aliased namespace - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param NS An aliased namespace. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, - DIImportedEntity *NS, unsigned Line); + DIImportedEntity *NS, DIFile *File, + unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param M The module being imported here - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param M The module being imported here + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. 
DIImportedEntity *createImportedModule(DIScope *Context, DIModule *M, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported function. - /// \param Context The scope this module is imported into - /// \param Decl The declaration (or definition) of a function, type, or - /// variable - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param Decl The declaration (or definition) of a function, type, or + /// variable. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedDeclaration(DIScope *Context, DINode *Decl, - unsigned Line, + DIFile *File, unsigned Line, StringRef Name = ""); /// Insert a new llvm.dbg.declare intrinsic call. diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 9374fe4fae76..678a43ae7926 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -435,10 +435,10 @@ public: /// Return the raw underlying file. /// - /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file - /// (it\em is the file). If \c this is an \a DIFile, we need to return \c - /// this. Otherwise, return the first operand, which is where all other - /// subclasses store their file pointer. + /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file (it + /// \em is the file). If \c this is an \a DIFile, we need to return \c this. + /// Otherwise, return the first operand, which is where all other subclasses + /// store their file pointer. Metadata *getRawFile() const { return isa(this) ? 
const_cast(this) : static_cast(getOperand(0)); @@ -2551,32 +2551,32 @@ class DIImportedEntity : public DINode { static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name, + DIFile *File, unsigned Line, StringRef Name, StorageType Storage, bool ShouldCreate = true) { - return getImpl(Context, Tag, Scope, Entity, Line, + return getImpl(Context, Tag, Scope, Entity, File, Line, getCanonicalMDString(Context, Name), Storage, ShouldCreate); } static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate = true); TempDIImportedEntity cloneImpl() const { return getTemporary(getContext(), getTag(), getScope(), getEntity(), - getLine(), getName()); + getFile(), getLine(), getName()); } public: DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name = ""), - (Tag, Scope, Entity, Line, Name)) + DIFile *File, unsigned Line, StringRef Name = ""), + (Tag, Scope, Entity, File, Line, Name)) DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name), - (Tag, Scope, Entity, Line, Name)) + Metadata *File, unsigned Line, MDString *Name), + (Tag, Scope, Entity, File, Line, Name)) TempDIImportedEntity clone() const { return cloneImpl(); } @@ -2584,10 +2584,12 @@ public: DIScope *getScope() const { return cast_or_null(getRawScope()); } DINodeRef getEntity() const { return DINodeRef(getRawEntity()); } StringRef getName() const { return getStringOperand(2); } + DIFile *getFile() const { return cast_or_null(getRawFile()); } Metadata *getRawScope() const { return getOperand(0); } Metadata *getRawEntity() const { return getOperand(1); } MDString *getRawName() const { return getOperandAs(2); } + Metadata 
*getRawFile() const { return getOperand(3); } static bool classof(const Metadata *MD) { return MD->getMetadataID() == DIImportedEntityKind; diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h index e10d14c19793..5b21a2c83e4a 100644 --- a/include/llvm/IR/Dominators.h +++ b/include/llvm/IR/Dominators.h @@ -34,22 +34,31 @@ class Module; class raw_ostream; extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree namespace DomTreeBuilder { -extern template void Calculate( - DominatorTreeBaseByGraphTraits> &DT, Function &F); - -extern template void Calculate>( - DominatorTreeBaseByGraphTraits>> &DT, - Function &F); - -extern template bool Verify( - const DominatorTreeBaseByGraphTraits> &DT); - -extern template bool Verify>( - const DominatorTreeBaseByGraphTraits>> - &DT); +using BBDomTree = DomTreeBase; +using BBPostDomTree = PostDomTreeBase; + +extern template void Calculate(BBDomTree &DT, Function &F); +extern template void Calculate(BBPostDomTree &DT, + Function &F); + +extern template void InsertEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void InsertEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); + +extern template void DeleteEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void DeleteEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); + +extern template bool Verify(const BBDomTree &DT); +extern template bool Verify(const BBPostDomTree &DT); } // namespace DomTreeBuilder using DomTreeNode = DomTreeNodeBase; @@ -122,14 +131,12 @@ template <> struct DenseMapInfo { /// the dominator tree is initially constructed may still exist in the tree, /// even if the tree is properly updated. Calling code should not rely on the /// preceding statements; this is stated only to assist human understanding. 
-class DominatorTree : public DominatorTreeBase { -public: - using Base = DominatorTreeBase; +class DominatorTree : public DominatorTreeBase { + public: + using Base = DominatorTreeBase; - DominatorTree() : DominatorTreeBase(false) {} - explicit DominatorTree(Function &F) : DominatorTreeBase(false) { - recalculate(F); - } + DominatorTree() = default; + explicit DominatorTree(Function &F) { recalculate(F); } /// Handle invalidation explicitly. bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td index 8ac56e03be6a..098245344725 100644 --- a/include/llvm/IR/IntrinsicsHexagon.td +++ b/include/llvm/IR/IntrinsicsHexagon.td @@ -32,16 +32,6 @@ class Hexagon_qi_mem_Intrinsic : Hexagon_Intrinsic; - -// -// DEF_FUNCTION_TYPE_1(void_ftype_SI,BT_VOID,BT_INT) -> -// Hexagon_void_si_Intrinsic -// -class Hexagon_void_si_Intrinsic - : Hexagon_Intrinsic; - // // DEF_FUNCTION_TYPE_1(HI_ftype_SI,BT_I16,BT_INT) -> // Hexagon_hi_si_Intrinsic @@ -4959,11 +4949,25 @@ Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">; // def int_hexagon_S2_deinterleave : Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">; + // // BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1) // def int_hexagon_prefetch : -Hexagon_void_si_Intrinsic<"HEXAGON_prefetch">; +Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleana : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleaninva : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dcinva : +Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dczeroa : +Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>; +def int_hexagon_Y4_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>; +def int_hexagon_Y5_l2fetch : 
+Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>; def llvm_ptr32_ty : LLVMPointerType; def llvm_ptr64_ty : LLVMPointerType; diff --git a/include/llvm/IR/IntrinsicsSystemZ.td b/include/llvm/IR/IntrinsicsSystemZ.td index 9be37d3645b2..98065bc51d99 100644 --- a/include/llvm/IR/IntrinsicsSystemZ.td +++ b/include/llvm/IR/IntrinsicsSystemZ.td @@ -373,6 +373,49 @@ let TargetPrefix = "s390" in { def int_s390_vfidb : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Instructions from the Vector Enhancements Facility 1 + def int_s390_vbperm : SystemZBinaryConv<"vbperm", llvm_v2i64_ty, + llvm_v16i8_ty>; + + def int_s390_vmslg : GCCBuiltin<"__builtin_s390_vmslg">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty, + llvm_i32_ty], [IntrNoMem]>; + + def int_s390_vfmaxdb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmindb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmaxsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfminsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + + def int_s390_vfcesbs : SystemZBinaryConvCC; + def int_s390_vfchsbs : SystemZBinaryConvCC; + def int_s390_vfchesbs : SystemZBinaryConvCC; + + def int_s390_vftcisb : SystemZBinaryConvIntCC; + + def int_s390_vfisb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + + // Instructions from the Vector Packed Decimal Facility + def int_s390_vlrl : GCCBuiltin<"__builtin_s390_vlrl">, + Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_s390_vstrl : GCCBuiltin<"__builtin_s390_vstrl">, + Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], + // In fact write-only but there's no property + // for that. 
+ [IntrArgMemOnly]>; } //===----------------------------------------------------------------------===// diff --git a/include/llvm/MC/LaneBitmask.h b/include/llvm/MC/LaneBitmask.h index 5ca06d1148e2..73b987b074db 100644 --- a/include/llvm/MC/LaneBitmask.h +++ b/include/llvm/MC/LaneBitmask.h @@ -75,6 +75,9 @@ namespace llvm { static LaneBitmask getNone() { return LaneBitmask(0); } static LaneBitmask getAll() { return ~LaneBitmask(0); } + static LaneBitmask getLane(unsigned Lane) { + return LaneBitmask(Type(1) << Lane); + } private: Type Mask = 0; diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h index b493ca0b0ea7..b83086c327f2 100644 --- a/include/llvm/MC/MCFixup.h +++ b/include/llvm/MC/MCFixup.h @@ -69,7 +69,7 @@ class MCFixup { /// an instruction or an assembler directive. const MCExpr *Value; - /// The byte index of start of the relocation inside the encoded instruction. + /// The byte index of start of the relocation inside the MCFragment. uint32_t Offset; /// The target dependent kind of fixup item this is. The kind is used to diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 340d8253b8c9..9150a8b5c80a 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -209,6 +209,15 @@ public: /// well. unsigned getNumOperands() const { return NumOperands; } + using const_opInfo_iterator = const MCOperandInfo *; + + const_opInfo_iterator opInfo_begin() const { return OpInfo; } + const_opInfo_iterator opInfo_end() const { return OpInfo + NumOperands; } + + iterator_range operands() const { + return make_range(opInfo_begin(), opInfo_end()); + } + /// \brief Return the number of MachineOperands that are register /// definitions. Register definitions always occur at the start of the /// machine operand list. 
This is the number of "outs" in the .td file, diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h index 060f965233e1..8e215b565fc4 100644 --- a/include/llvm/Object/COFFImportFile.h +++ b/include/llvm/Object/COFFImportFile.h @@ -95,7 +95,7 @@ struct COFFShortExport { } }; -std::error_code writeImportLibrary(StringRef DLLName, +std::error_code writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, COFF::MachineTypes Machine); diff --git a/include/llvm/Object/COFFModuleDefinition.h b/include/llvm/Object/COFFModuleDefinition.h index a0e8eacdb7a3..be139a2833b0 100644 --- a/include/llvm/Object/COFFModuleDefinition.h +++ b/include/llvm/Object/COFFModuleDefinition.h @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// - #ifndef LLVM_OBJECT_COFF_MODULE_DEFINITION_H #define LLVM_OBJECT_COFF_MODULE_DEFINITION_H @@ -29,6 +28,7 @@ namespace object { struct COFFModuleDefinition { std::vector Exports; std::string OutputFile; + std::string ImportName; uint64_t ImageBase = 0; uint64_t StackReserve = 0; uint64_t StackCommit = 0; @@ -40,8 +40,12 @@ struct COFFModuleDefinition { uint32_t MinorOSVersion = 0; }; +// mingw and wine def files do not mangle _ for x86 which +// is a consequence of legacy binutils' dlltool functionality. +// This MingwDef flag should be removed once mingw stops this pratice. Expected -parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine); +parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine, + bool MingwDef = false); } // End namespace object. } // End namespace llvm. 
diff --git a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h index 6746fd60b6cb..88a5668f0a14 100644 --- a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h +++ b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -60,6 +60,8 @@ ArrayRef toDebugT(ArrayRef, BumpPtrAllocator &Alloc); } // end namespace llvm +LLVM_YAML_DECLARE_SCALAR_TRAITS(codeview::GUID, true) + LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::LeafRecord) LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::MemberRecord) diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def index 8eccebcd932a..09f9602a24d9 100644 --- a/include/llvm/Support/AArch64TargetParser.def +++ b/include/llvm/Support/AArch64TargetParser.def @@ -43,8 +43,9 @@ AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto") AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8") AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon") AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16") -AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") -AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") +AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") #undef AARCH64_ARCH_EXT_NAME #ifndef AARCH64_CPU_NAME diff --git a/include/llvm/Support/BinaryItemStream.h b/include/llvm/Support/BinaryItemStream.h index f4b319217819..fe7e6caeaafb 100644 --- a/include/llvm/Support/BinaryItemStream.h +++ b/include/llvm/Support/BinaryItemStream.h @@ -62,32 +62,45 @@ public: return Error::success(); } - void setItems(ArrayRef ItemArray) { Items = ItemArray; } + void setItems(ArrayRef ItemArray) { + Items = ItemArray; + computeItemOffsets(); + } uint32_t getLength() override { - uint32_t Size = 0; - for (const auto &Item : Items) - Size += 
Traits::length(Item); - return Size; + return ItemEndOffsets.empty() ? 0 : ItemEndOffsets.back(); } private: - Expected translateOffsetIndex(uint32_t Offset) const { + void computeItemOffsets() { + ItemEndOffsets.clear(); + ItemEndOffsets.reserve(Items.size()); uint32_t CurrentOffset = 0; - uint32_t CurrentIndex = 0; for (const auto &Item : Items) { - if (CurrentOffset >= Offset) - break; - CurrentOffset += Traits::length(Item); - ++CurrentIndex; + uint32_t Len = Traits::length(Item); + assert(Len > 0 && "no empty items"); + CurrentOffset += Len; + ItemEndOffsets.push_back(CurrentOffset); } - if (CurrentOffset != Offset) + } + + Expected translateOffsetIndex(uint32_t Offset) { + // Make sure the offset is somewhere in our items array. + if (Offset >= getLength()) return make_error(stream_error_code::stream_too_short); - return CurrentIndex; + ++Offset; + auto Iter = + std::lower_bound(ItemEndOffsets.begin(), ItemEndOffsets.end(), Offset); + size_t Idx = std::distance(ItemEndOffsets.begin(), Iter); + assert(Idx < Items.size() && "binary search for offset failed"); + return Idx; } llvm::support::endianness Endian; ArrayRef Items; + + // Sorted vector of offsets to accelerate lookup. + std::vector ItemEndOffsets; }; } // end namespace llvm diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h index 017b4973f1ff..bcbd2bec5722 100644 --- a/include/llvm/Support/Format.h +++ b/include/llvm/Support/Format.h @@ -125,30 +125,39 @@ inline format_object format(const char *Fmt, const Ts &... Vals) { return format_object(Fmt, Vals...); } -/// This is a helper class used for left_justify() and right_justify(). +/// This is a helper class for left_justify, right_justify, and center_justify. 
class FormattedString { +public: + enum Justification { JustifyNone, JustifyLeft, JustifyRight, JustifyCenter }; + FormattedString(StringRef S, unsigned W, Justification J) + : Str(S), Width(W), Justify(J) {} + +private: StringRef Str; unsigned Width; - bool RightJustify; + Justification Justify; friend class raw_ostream; - -public: - FormattedString(StringRef S, unsigned W, bool R) - : Str(S), Width(W), RightJustify(R) { } }; /// left_justify - append spaces after string so total output is /// \p Width characters. If \p Str is larger that \p Width, full string /// is written with no padding. inline FormattedString left_justify(StringRef Str, unsigned Width) { - return FormattedString(Str, Width, false); + return FormattedString(Str, Width, FormattedString::JustifyLeft); } /// right_justify - add spaces before string so total output is /// \p Width characters. If \p Str is larger that \p Width, full string /// is written with no padding. inline FormattedString right_justify(StringRef Str, unsigned Width) { - return FormattedString(Str, Width, true); + return FormattedString(Str, Width, FormattedString::JustifyRight); +} + +/// center_justify - add spaces before and after string so total output is +/// \p Width characters. If \p Str is larger that \p Width, full string +/// is written with no padding. +inline FormattedString center_justify(StringRef Str, unsigned Width) { + return FormattedString(Str, Width, FormattedString::JustifyCenter); } /// This is a helper class used for format_hex() and format_decimal(). 
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index 394a45387d8a..706320fed9a7 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -41,27 +41,21 @@ namespace llvm { -template class DominatorTreeBase; +template +class DominatorTreeBase; -namespace detail { - -template struct DominatorTreeBaseTraits { - static_assert(std::is_pointer::value, - "Currently NodeRef must be a pointer type."); - using type = DominatorTreeBase< - typename std::remove_pointer::type>; -}; - -} // end namespace detail - -template -using DominatorTreeBaseByGraphTraits = - typename detail::DominatorTreeBaseTraits::type; +namespace DomTreeBuilder { +template +struct SemiNCAInfo; +} // namespace DomTreeBuilder /// \brief Base class for the actual dominator tree node. template class DomTreeNodeBase { friend struct PostDominatorTree; - template friend class DominatorTreeBase; + friend class DominatorTreeBase; + friend class DominatorTreeBase; + friend struct DomTreeBuilder::SemiNCAInfo>; + friend struct DomTreeBuilder::SemiNCAInfo>; NodeT *TheBB; DomTreeNodeBase *IDom; @@ -192,58 +186,69 @@ void PrintDomTree(const DomTreeNodeBase *N, raw_ostream &O, } namespace DomTreeBuilder { -template -struct SemiNCAInfo; +// The routines below are provided in a separate header but referenced here. +template +void Calculate(DomTreeT &DT, FuncT &F); + +template +void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To); -// The calculate routine is provided in a separate header but referenced here. -template -void Calculate(DominatorTreeBaseByGraphTraits> &DT, FuncT &F); +template +void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To); -// The verify function is provided in a separate header but referenced here. 
-template -bool Verify(const DominatorTreeBaseByGraphTraits> &DT); +template +bool Verify(const DomTreeT &DT); } // namespace DomTreeBuilder /// \brief Core dominator tree base class. /// /// This class is a generic template over graph nodes. It is instantiated for /// various graphs in the LLVM IR or in the code generator. -template class DominatorTreeBase { +template +class DominatorTreeBase { protected: std::vector Roots; - bool IsPostDominators; using DomTreeNodeMapType = DenseMap>>; DomTreeNodeMapType DomTreeNodes; DomTreeNodeBase *RootNode; + using ParentPtr = decltype(std::declval()->getParent()); + ParentPtr Parent = nullptr; mutable bool DFSInfoValid = false; mutable unsigned int SlowQueries = 0; - friend struct DomTreeBuilder::SemiNCAInfo; - using SNCAInfoTy = DomTreeBuilder::SemiNCAInfo; + friend struct DomTreeBuilder::SemiNCAInfo; public: - explicit DominatorTreeBase(bool isPostDom) : IsPostDominators(isPostDom) {} + static_assert(std::is_pointer::NodeRef>::value, + "Currently DominatorTreeBase supports only pointer nodes"); + using NodeType = NodeT; + using NodePtr = NodeT *; + static constexpr bool IsPostDominator = IsPostDom; + + DominatorTreeBase() {} DominatorTreeBase(DominatorTreeBase &&Arg) : Roots(std::move(Arg.Roots)), - IsPostDominators(Arg.IsPostDominators), DomTreeNodes(std::move(Arg.DomTreeNodes)), - RootNode(std::move(Arg.RootNode)), - DFSInfoValid(std::move(Arg.DFSInfoValid)), - SlowQueries(std::move(Arg.SlowQueries)) { + RootNode(Arg.RootNode), + Parent(Arg.Parent), + DFSInfoValid(Arg.DFSInfoValid), + SlowQueries(Arg.SlowQueries) { Arg.wipe(); } DominatorTreeBase &operator=(DominatorTreeBase &&RHS) { Roots = std::move(RHS.Roots); - IsPostDominators = RHS.IsPostDominators; DomTreeNodes = std::move(RHS.DomTreeNodes); - RootNode = std::move(RHS.RootNode); - DFSInfoValid = std::move(RHS.DFSInfoValid); - SlowQueries = std::move(RHS.SlowQueries); + RootNode = RHS.RootNode; + Parent = RHS.Parent; + DFSInfoValid = RHS.DFSInfoValid; + SlowQueries 
= RHS.SlowQueries; RHS.wipe(); return *this; } @@ -259,11 +264,12 @@ template class DominatorTreeBase { /// isPostDominator - Returns true if analysis based of postdoms /// - bool isPostDominator() const { return IsPostDominators; } + bool isPostDominator() const { return IsPostDominator; } /// compare - Return false if the other dominator tree base matches this /// dominator tree base. Otherwise return true. bool compare(const DominatorTreeBase &Other) const { + if (Parent != Other.Parent) return true; const DomTreeNodeMapType &OtherDomTreeNodes = Other.DomTreeNodes; if (DomTreeNodes.size() != OtherDomTreeNodes.size()) @@ -443,10 +449,50 @@ template class DominatorTreeBase { const_cast(B)); } + bool isVirtualRoot(const DomTreeNodeBase *A) const { + return isPostDominator() && !A->getBlock(); + } + //===--------------------------------------------------------------------===// // API to update (Post)DominatorTree information based on modifications to // the CFG... + /// Inform the dominator tree about a CFG edge insertion and update the tree. + /// + /// This function has to be called just before or just after making the update + /// on the actual CFG. There cannot be any other updates that the dominator + /// tree doesn't know about. + /// + /// Note that for postdominators it automatically takes care of inserting + /// a reverse edge internally (so there's no need to swap the parameters). + /// + void insertEdge(NodeT *From, NodeT *To) { + assert(From); + assert(To); + assert(From->getParent() == Parent); + assert(To->getParent() == Parent); + DomTreeBuilder::InsertEdge(*this, From, To); + } + + /// Inform the dominator tree about a CFG edge deletion and update the tree. + /// + /// This function has to be called just after making the update + /// on the actual CFG. There cannot be any other updates that the dominator + /// tree doesn't know about. 
The only exception is when the deletion that the + /// tree is informed about makes some (domominator) subtree unreachable -- in + /// this case, it is fine to perform deletions within this subtree. + /// + /// Note that for postdominators it automatically takes care of deleting + /// a reverse edge internally (so there's no need to swap the parameters). + /// + void deleteEdge(NodeT *From, NodeT *To) { + assert(From); + assert(To); + assert(From->getParent() == Parent); + assert(To->getParent() == Parent); + DomTreeBuilder::DeleteEdge(*this, From, To); + } + /// Add a new node to the dominator tree information. /// /// This creates a new node as a child of DomBB dominator node, linking it @@ -530,7 +576,7 @@ template class DominatorTreeBase { /// splitBlock - BB is split and now it has one successor. Update dominator /// tree to reflect this change. void splitBlock(NodeT *NewBB) { - if (this->IsPostDominators) + if (IsPostDominator) Split>(NewBB); else Split(NewBB); @@ -607,37 +653,33 @@ public: template void recalculate(FT &F) { using TraitsTy = GraphTraits; reset(); + Parent = &F; - if (!this->IsPostDominators) { + if (!IsPostDominator) { // Initialize root NodeT *entry = TraitsTy::getEntryNode(&F); addRoot(entry); - - DomTreeBuilder::Calculate(*this, F); } else { // Initialize the roots list for (auto *Node : nodes(&F)) if (TraitsTy::child_begin(Node) == TraitsTy::child_end(Node)) addRoot(Node); - - DomTreeBuilder::Calculate>(*this, F); } + + DomTreeBuilder::Calculate(*this, F); } /// verify - check parent and sibling property - bool verify() const { - return this->isPostDominator() - ? 
DomTreeBuilder::Verify>(*this) - : DomTreeBuilder::Verify(*this); - } + bool verify() const { return DomTreeBuilder::Verify(*this); } protected: void addRoot(NodeT *BB) { this->Roots.push_back(BB); } void reset() { DomTreeNodes.clear(); - this->Roots.clear(); + Roots.clear(); RootNode = nullptr; + Parent = nullptr; DFSInfoValid = false; SlowQueries = 0; } @@ -719,13 +761,21 @@ public: void wipe() { DomTreeNodes.clear(); RootNode = nullptr; + Parent = nullptr; } }; +template +using DomTreeBase = DominatorTreeBase; + +template +using PostDomTreeBase = DominatorTreeBase; + // These two functions are declared out of line as a workaround for building // with old (< r147295) versions of clang because of pr11642. -template -bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const { +template +bool DominatorTreeBase::dominates(const NodeT *A, + const NodeT *B) const { if (A == B) return true; @@ -735,9 +785,9 @@ bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const { return dominates(getNode(const_cast(A)), getNode(const_cast(B))); } -template -bool DominatorTreeBase::properlyDominates(const NodeT *A, - const NodeT *B) const { +template +bool DominatorTreeBase::properlyDominates( + const NodeT *A, const NodeT *B) const { if (A == B) return false; diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index a0fec668e05c..be90afa4c3c8 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -20,15 +20,28 @@ /// out that the theoretically slower O(n*log(n)) implementation is actually /// faster than the almost-linear O(n*alpha(n)) version, even for large CFGs. /// +/// The file uses the Depth Based Search algorithm to perform incremental +/// upates (insertion and deletions). 
The implemented algorithm is based on this +/// publication: +/// +/// An Experimental Study of Dynamic Dominators +/// Loukas Georgiadis, et al., April 12 2016, pp. 5-7, 9-10: +/// https://arxiv.org/pdf/1604.02711.pdf +/// //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H #define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H +#include +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/GenericDomTree.h" +#define DEBUG_TYPE "dom-tree-builder" + namespace llvm { namespace DomTreeBuilder { @@ -46,13 +59,14 @@ struct ChildrenGetter { } }; -// Information record used by Semi-NCA during tree construction. -template +template struct SemiNCAInfo { - using NodePtr = NodeT *; - using DomTreeT = DominatorTreeBase; + using NodePtr = typename DomTreeT::NodePtr; + using NodeT = typename DomTreeT::NodeType; using TreeNodePtr = DomTreeNodeBase *; + static constexpr bool IsPostDom = DomTreeT::IsPostDominator; + // Information record used by Semi-NCA during tree construction. struct InfoRec { unsigned DFSNum = 0; unsigned Parent = 0; @@ -62,11 +76,13 @@ struct SemiNCAInfo { SmallVector ReverseChildren; }; - std::vector NumToNode; + // Number to node mapping is 1-based. Initialize the mapping to start with + // a dummy element. + std::vector NumToNode = {nullptr}; DenseMap NodeToInfo; void clear() { - NumToNode.clear(); + NumToNode = {nullptr}; // Restore to initial state with a dummy start node. 
NodeToInfo.clear(); } @@ -90,12 +106,28 @@ struct SemiNCAInfo { // Add a new tree node for this NodeT, and link it as a child of // IDomNode return (DT.DomTreeNodes[BB] = IDomNode->addChild( - llvm::make_unique>(BB, IDomNode))) + llvm::make_unique>(BB, IDomNode))) .get(); } static bool AlwaysDescend(NodePtr, NodePtr) { return true; } + struct BlockNamePrinter { + NodePtr N; + + BlockNamePrinter(NodePtr Block) : N(Block) {} + BlockNamePrinter(TreeNodePtr TN) : N(TN ? TN->getBlock() : nullptr) {} + + friend raw_ostream &operator<<(raw_ostream &O, const BlockNamePrinter &BP) { + if (!BP.N) + O << "nullptr"; + else + BP.N->printAsOperand(O, false); + + return O; + } + }; + // Custom DFS implementation which can skip nodes based on a provided // predicate. It also collects ReverseChildren so that we don't have to spend // time getting predecessors in SemiNCA. @@ -177,44 +209,42 @@ struct SemiNCAInfo { return VInInfo.Label; } - template - void runSemiNCA(DomTreeT &DT, unsigned NumBlocks) { - // Step #1: Number blocks in depth-first order and initialize variables used - // in later stages of the algorithm. - const unsigned N = doFullDFSWalk(DT, AlwaysDescend); - - // It might be that some blocks did not get a DFS number (e.g., blocks of - // infinite loops). In these cases an artificial exit node is required. - const bool MultipleRoots = - DT.Roots.size() > 1 || (DT.isPostDominator() && N != NumBlocks); - + // This function requires DFS to be run before calling it. + void runSemiNCA(DomTreeT &DT, const unsigned MinLevel = 0) { + const unsigned NextDFSNum(NumToNode.size()); // Initialize IDoms to spanning tree parents. - for (unsigned i = 1; i <= N; ++i) { + for (unsigned i = 1; i < NextDFSNum; ++i) { const NodePtr V = NumToNode[i]; auto &VInfo = NodeToInfo[V]; VInfo.IDom = NumToNode[VInfo.Parent]; } - // Step #2: Calculate the semidominators of all vertices. - for (unsigned i = N; i >= 2; --i) { + // Step #1: Calculate the semidominators of all vertices. 
+ for (unsigned i = NextDFSNum - 1; i >= 2; --i) { NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; // Initialize the semi dominator to point to the parent node. WInfo.Semi = WInfo.Parent; - for (const auto &N : WInfo.ReverseChildren) - if (NodeToInfo.count(N)) { // Only if this predecessor is reachable! - unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; - if (SemiU < WInfo.Semi) - WInfo.Semi = SemiU; - } + for (const auto &N : WInfo.ReverseChildren) { + if (NodeToInfo.count(N) == 0) // Skip unreachable predecessors. + continue; + + const TreeNodePtr TN = DT.getNode(N); + // Skip predecessors whose level is above the subtree we are processing. + if (TN && TN->getLevel() < MinLevel) + continue; + + unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; + if (SemiU < WInfo.Semi) WInfo.Semi = SemiU; + } } - // Step #3: Explicitly define the immediate dominator of each vertex. + // Step #2: Explicitly define the immediate dominator of each vertex. // IDom[i] = NCA(SDom[i], SpanningTreeParent(i)). // Note that the parents were stored in IDoms and later got invalidated // during path compression in Eval. 
- for (unsigned i = 2; i <= N; ++i) { + for (unsigned i = 2; i < NextDFSNum; ++i) { const NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; const unsigned SDomNum = NodeToInfo[NumToNode[WInfo.Semi]].DFSNum; @@ -224,6 +254,36 @@ struct SemiNCAInfo { WInfo.IDom = WIDomCandidate; } + } + + template + unsigned doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) { + unsigned Num = 0; + + if (DT.Roots.size() > 1) { + auto &BBInfo = NodeToInfo[nullptr]; + BBInfo.DFSNum = BBInfo.Semi = ++Num; + BBInfo.Label = nullptr; + + NumToNode.push_back(nullptr); // NumToNode[n] = V; + } + + if (DT.isPostDominator()) { + for (auto *Root : DT.Roots) Num = runDFS(Root, Num, DC, 1); + } else { + assert(DT.Roots.size() == 1); + Num = runDFS(DT.Roots[0], Num, DC, Num); + } + + return Num; + } + + void calculateFromScratch(DomTreeT &DT, const unsigned NumBlocks) { + // Step #0: Number blocks in depth-first order and initialize variables used + // in later stages of the algorithm. + const unsigned LastDFSNum = doFullDFSWalk(DT, AlwaysDescend); + + runSemiNCA(DT); if (DT.Roots.empty()) return; @@ -231,25 +291,32 @@ struct SemiNCAInfo { // one exit block, or it may be the virtual exit (denoted by // (BasicBlock *)0) which postdominates all real exits if there are multiple // exit blocks, or an infinite loop. + // It might be that some blocks did not get a DFS number (e.g., blocks of + // infinite loops). In these cases an artificial exit node is required. + const bool MultipleRoots = DT.Roots.size() > 1 || (DT.isPostDominator() && + LastDFSNum != NumBlocks); NodePtr Root = !MultipleRoots ? DT.Roots[0] : nullptr; - DT.RootNode = - (DT.DomTreeNodes[Root] = - llvm::make_unique>(Root, nullptr)) - .get(); + DT.RootNode = (DT.DomTreeNodes[Root] = + llvm::make_unique>(Root, nullptr)) + .get(); + attachNewSubtree(DT, DT.RootNode); + } - // Loop over all of the reachable blocks in the function... 
- for (unsigned i = 2; i <= N; ++i) { + void attachNewSubtree(DomTreeT& DT, const TreeNodePtr AttachTo) { + // Attach the first unreachable block to AttachTo. + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + // Loop over all of the discovered blocks in the function... + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { NodePtr W = NumToNode[i]; + DEBUG(dbgs() << "\tdiscovered a new reachable node " + << BlockNamePrinter(W) << "\n"); // Don't replace this with 'count', the insertion side effect is important - if (DT.DomTreeNodes[W]) - continue; // Haven't calculated this node yet? + if (DT.DomTreeNodes[W]) continue; // Haven't calculated this node yet? NodePtr ImmDom = getIDom(W); - assert(ImmDom || DT.DomTreeNodes[nullptr]); - // Get or calculate the node for the immediate dominator TreeNodePtr IDomNode = getNodeForBlock(ImmDom, DT); @@ -260,34 +327,208 @@ struct SemiNCAInfo { } } - template - unsigned doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) { - unsigned Num = 0; - NumToNode.push_back(nullptr); + void reattachExistingSubtree(DomTreeT &DT, const TreeNodePtr AttachTo) { + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { + const NodePtr N = NumToNode[i]; + const TreeNodePtr TN = DT.getNode(N); + assert(TN); + const TreeNodePtr NewIDom = DT.getNode(NodeToInfo[N].IDom); + TN->setIDom(NewIDom); + } + } - if (DT.Roots.size() > 1) { - auto &BBInfo = NodeToInfo[nullptr]; - BBInfo.DFSNum = BBInfo.Semi = ++Num; - BBInfo.Label = nullptr; + // Helper struct used during edge insertions. + struct InsertionInfo { + using BucketElementTy = std::pair; + struct DecreasingLevel { + bool operator()(const BucketElementTy &First, + const BucketElementTy &Second) const { + return First.first > Second.first; + } + }; + + std::priority_queue, + DecreasingLevel> + Bucket; // Queue of tree nodes sorted by level in descending order. 
+ SmallDenseSet Affected; + SmallDenseSet Visited; + SmallVector AffectedQueue; + SmallVector VisitedNotAffectedQueue; + }; - NumToNode.push_back(nullptr); // NumToNode[n] = V; + static void InsertEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) { + assert(From && To && "Cannot connect nullptrs"); + DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> " + << BlockNamePrinter(To) << "\n"); + const TreeNodePtr FromTN = DT.getNode(From); + + // Ignore edges from unreachable nodes. + if (!FromTN) return; + + DT.DFSInfoValid = false; + + const TreeNodePtr ToTN = DT.getNode(To); + if (!ToTN) + InsertUnreachable(DT, FromTN, To); + else + InsertReachable(DT, FromTN, ToTN); + } + + // Handles insertion to a node already in the dominator tree. + static void InsertReachable(DomTreeT &DT, const TreeNodePtr From, + const TreeNodePtr To) { + DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock()) + << " -> " << BlockNamePrinter(To->getBlock()) << "\n"); + const NodePtr NCDBlock = + DT.findNearestCommonDominator(From->getBlock(), To->getBlock()); + assert(NCDBlock || DT.isPostDominator()); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + assert(NCD); + + DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n"); + const TreeNodePtr ToIDom = To->getIDom(); + + // Nothing affected -- NCA property holds. + // (Based on the lemma 2.5 from the second paper.) + if (NCD == To || NCD == ToIDom) return; + + // Identify and collect affected nodes. 
+ InsertionInfo II; + DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n"); + II.Affected.insert(To); + const unsigned ToLevel = To->getLevel(); + DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n"); + II.Bucket.push({ToLevel, To}); + + while (!II.Bucket.empty()) { + const TreeNodePtr CurrentNode = II.Bucket.top().second; + II.Bucket.pop(); + DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: " + << BlockNamePrinter(CurrentNode) << "\n"); + II.Visited.insert(CurrentNode); + II.AffectedQueue.push_back(CurrentNode); + + // Discover and collect affected successors of the current node. + VisitInsertion(DT, CurrentNode, CurrentNode->getLevel(), NCD, II); } - if (DT.isPostDominator()) { - for (auto *Root : DT.Roots) Num = runDFS(Root, Num, DC, 1); - } else { - assert(DT.Roots.size() == 1); - Num = runDFS(DT.Roots[0], Num, DC, Num); + // Finish by updating immediate dominators and levels. + UpdateInsertion(DT, NCD, II); + } + + // Visits an affected node and collect its affected successors. + static void VisitInsertion(DomTreeT &DT, const TreeNodePtr TN, + const unsigned RootLevel, const TreeNodePtr NCD, + InsertionInfo &II) { + const unsigned NCDLevel = NCD->getLevel(); + DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n"); + + assert(TN->getBlock()); + for (const NodePtr Succ : + ChildrenGetter::Get(TN->getBlock())) { + const TreeNodePtr SuccTN = DT.getNode(Succ); + assert(SuccTN && "Unreachable successor found at reachable insertion"); + const unsigned SuccLevel = SuccTN->getLevel(); + + DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) + << ", level = " << SuccLevel << "\n"); + + // Succ dominated by subtree From -- not affected. + // (Based on the lemma 2.5 from the second paper.) 
+ if (SuccLevel > RootLevel) { + DEBUG(dbgs() << "\t\tDominated by subtree From\n"); + if (II.Visited.count(SuccTN) != 0) continue; + + DEBUG(dbgs() << "\t\tMarking visited not affected " + << BlockNamePrinter(Succ) << "\n"); + II.Visited.insert(SuccTN); + II.VisitedNotAffectedQueue.push_back(SuccTN); + VisitInsertion(DT, SuccTN, RootLevel, NCD, II); + } else if ((SuccLevel > NCDLevel + 1) && II.Affected.count(SuccTN) == 0) { + DEBUG(dbgs() << "\t\tMarking affected and adding " + << BlockNamePrinter(Succ) << " to a Bucket\n"); + II.Affected.insert(SuccTN); + II.Bucket.push({SuccLevel, SuccTN}); + } } + } - return Num; + // Updates immediate dominators and levels after insertion. + static void UpdateInsertion(DomTreeT &DT, const TreeNodePtr NCD, + InsertionInfo &II) { + DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n"); + + for (const TreeNodePtr TN : II.AffectedQueue) { + DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(TN) + << ") = " << BlockNamePrinter(NCD) << "\n"); + TN->setIDom(NCD); + } + + UpdateLevelsAfterInsertion(II); } - static void PrintBlockOrNullptr(raw_ostream &O, NodePtr Obj) { - if (!Obj) - O << "nullptr"; - else - Obj->printAsOperand(O, false); + static void UpdateLevelsAfterInsertion(InsertionInfo &II) { + DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n"); + + for (const TreeNodePtr TN : II.VisitedNotAffectedQueue) { + DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(TN) << ") = (" + << BlockNamePrinter(TN->getIDom()) << ") " + << TN->getIDom()->getLevel() << " + 1\n"); + TN->UpdateLevel(); + } + } + + // Handles insertion to previously unreachable nodes. + static void InsertUnreachable(DomTreeT &DT, const TreeNodePtr From, + const NodePtr To) { + DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From) + << " -> (unreachable) " << BlockNamePrinter(To) << "\n"); + + // Collect discovered edges to already reachable nodes. 
+ SmallVector, 8> DiscoveredEdgesToReachable; + // Discover and connect nodes that became reachable with the insertion. + ComputeUnreachableDominators(DT, To, From, DiscoveredEdgesToReachable); + + DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From) + << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n"); + + DEBUG(DT.print(dbgs())); + + // Used the discovered edges and inset discovered connecting (incoming) + // edges. + for (const auto &Edge : DiscoveredEdgesToReachable) { + DEBUG(dbgs() << "\tInserting discovered connecting edge " + << BlockNamePrinter(Edge.first) << " -> " + << BlockNamePrinter(Edge.second) << "\n"); + InsertReachable(DT, DT.getNode(Edge.first), Edge.second); + } + } + + // Connects nodes that become reachable with an insertion. + static void ComputeUnreachableDominators( + DomTreeT &DT, const NodePtr Root, const TreeNodePtr Incoming, + SmallVectorImpl> + &DiscoveredConnectingEdges) { + assert(!DT.getNode(Root) && "Root must not be reachable"); + + // Visit only previously unreachable nodes. + auto UnreachableDescender = [&DT, &DiscoveredConnectingEdges](NodePtr From, + NodePtr To) { + const TreeNodePtr ToTN = DT.getNode(To); + if (!ToTN) return true; + + DiscoveredConnectingEdges.push_back({From, ToTN}); + return false; + }; + + SemiNCAInfo SNCA; + SNCA.runDFS(Root, 0, UnreachableDescender, 0); + SNCA.runSemiNCA(DT); + SNCA.attachNewSubtree(DT, Incoming); + + DEBUG(dbgs() << "After adding unreachable nodes\n"); + DEBUG(DT.print(dbgs())); } // Checks if the tree contains all reachable nodes in the input graph. @@ -298,12 +539,23 @@ struct SemiNCAInfo { for (auto &NodeToTN : DT.DomTreeNodes) { const TreeNodePtr TN = NodeToTN.second.get(); const NodePtr BB = TN->getBlock(); - if (!BB) continue; + + // Virtual root has a corresponding virtual CFG node. 
+ if (DT.isVirtualRoot(TN)) continue; if (NodeToInfo.count(BB) == 0) { - errs() << "DomTree node "; - PrintBlockOrNullptr(errs(), BB); - errs() << " not found by DFS walk!\n"; + errs() << "DomTree node " << BlockNamePrinter(BB) + << " not found by DFS walk!\n"; + errs().flush(); + + return false; + } + } + + for (const NodePtr N : NumToNode) { + if (N && !DT.getNode(N)) { + errs() << "CFG node " << BlockNamePrinter(N) + << " not found in the DomTree!\n"; errs().flush(); return false; @@ -313,6 +565,215 @@ struct SemiNCAInfo { return true; } + static void DeleteEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) { + assert(From && To && "Cannot disconnect nullptrs"); + DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> " + << BlockNamePrinter(To) << "\n"); + +#ifndef NDEBUG + // Ensure that the edge was in fact deleted from the CFG before informing + // the DomTree about it. + // The check is O(N), so run it only in debug configuration. + auto IsSuccessor = [](const NodePtr SuccCandidate, const NodePtr Of) { + auto Successors = ChildrenGetter::Get(Of); + return llvm::find(Successors, SuccCandidate) != Successors.end(); + }; + (void)IsSuccessor; + assert(!IsSuccessor(To, From) && "Deleted edge still exists in the CFG!"); +#endif + + const TreeNodePtr FromTN = DT.getNode(From); + // Deletion in an unreachable subtree -- nothing to do. + if (!FromTN) return; + + const TreeNodePtr ToTN = DT.getNode(To); + assert(ToTN && "To already unreachable -- there is no edge to delete"); + const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + + // To dominates From -- nothing to do. + if (ToTN == NCD) return; + + const TreeNodePtr ToIDom = ToTN->getIDom(); + DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom " + << BlockNamePrinter(ToIDom) << "\n"); + + // To remains reachable after deletion. + // (Based on the caption under Figure 4. from the second paper.) 
+ if (FromTN != ToIDom || HasProperSupport(DT, ToTN)) + DeleteReachable(DT, FromTN, ToTN); + else + DeleteUnreachable(DT, ToTN); + } + + // Handles deletions that leave destination nodes reachable. + static void DeleteReachable(DomTreeT &DT, const TreeNodePtr FromTN, + const TreeNodePtr ToTN) { + DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> " + << BlockNamePrinter(ToTN) << "\n"); + DEBUG(dbgs() << "\tRebuilding subtree\n"); + + // Find the top of the subtree that needs to be rebuilt. + // (Based on the lemma 2.6 from the second paper.) + const NodePtr ToIDom = + DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock()); + assert(ToIDom || DT.isPostDominator()); + const TreeNodePtr ToIDomTN = DT.getNode(ToIDom); + assert(ToIDomTN); + const TreeNodePtr PrevIDomSubTree = ToIDomTN->getIDom(); + // Top of the subtree to rebuild is the root node. Rebuild the tree from + // scratch. + if (!PrevIDomSubTree) { + DEBUG(dbgs() << "The entire tree needs to be rebuilt\n"); + DT.recalculate(*DT.Parent); + return; + } + + // Only visit nodes in the subtree starting at To. + const unsigned Level = ToIDomTN->getLevel(); + auto DescendBelow = [Level, &DT](NodePtr, NodePtr To) { + return DT.getNode(To)->getLevel() > Level; + }; + + DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n"); + + SemiNCAInfo SNCA; + SNCA.runDFS(ToIDom, 0, DescendBelow, 0); + DEBUG(dbgs() << "\tRunning Semi-NCA\n"); + SNCA.runSemiNCA(DT, Level); + SNCA.reattachExistingSubtree(DT, PrevIDomSubTree); + } + + // Checks if a node has proper support, as defined on the page 3 and later + // explained on the page 7 of the second paper. 
+ static bool HasProperSupport(DomTreeT &DT, const TreeNodePtr TN) { + DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n"); + for (const NodePtr Pred : + ChildrenGetter::Get(TN->getBlock())) { + DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n"); + if (!DT.getNode(Pred)) continue; + + const NodePtr Support = + DT.findNearestCommonDominator(TN->getBlock(), Pred); + DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n"); + if (Support != TN->getBlock()) { + DEBUG(dbgs() << "\t" << BlockNamePrinter(TN) + << " is reachable from support " + << BlockNamePrinter(Support) << "\n"); + return true; + } + } + + return false; + } + + // Handle deletions that make destination node unreachable. + // (Based on the lemma 2.7 from the second paper.) + static void DeleteUnreachable(DomTreeT &DT, const TreeNodePtr ToTN) { + DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN) + << "\n"); + assert(ToTN); + assert(ToTN->getBlock()); + + SmallVector AffectedQueue; + const unsigned Level = ToTN->getLevel(); + + // Traverse destination node's descendants with greater level in the tree + // and collect visited nodes. + auto DescendAndCollect = [Level, &AffectedQueue, &DT](NodePtr, NodePtr To) { + const TreeNodePtr TN = DT.getNode(To); + assert(TN); + if (TN->getLevel() > Level) return true; + if (llvm::find(AffectedQueue, To) == AffectedQueue.end()) + AffectedQueue.push_back(To); + + return false; + }; + + SemiNCAInfo SNCA; + unsigned LastDFSNum = + SNCA.runDFS(ToTN->getBlock(), 0, DescendAndCollect, 0); + + TreeNodePtr MinNode = ToTN; + + // Identify the top of the subtree to rebuilt by finding the NCD of all + // the affected nodes. 
+ for (const NodePtr N : AffectedQueue) { + const TreeNodePtr TN = DT.getNode(N); + const NodePtr NCDBlock = + DT.findNearestCommonDominator(TN->getBlock(), ToTN->getBlock()); + assert(NCDBlock || DT.isPostDominator()); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + assert(NCD); + + DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN) + << " with NCD = " << BlockNamePrinter(NCD) + << ", MinNode =" << BlockNamePrinter(MinNode) << "\n"); + if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD; + } + + // Root reached, rebuild the whole tree from scratch. + if (!MinNode->getIDom()) { + DEBUG(dbgs() << "The entire tree needs to be rebuilt\n"); + DT.recalculate(*DT.Parent); + return; + } + + // Erase the unreachable subtree in reverse preorder to process all children + // before deleting their parent. + for (unsigned i = LastDFSNum; i > 0; --i) { + const NodePtr N = SNCA.NumToNode[i]; + const TreeNodePtr TN = DT.getNode(N); + DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n"); + + EraseNode(DT, TN); + } + + // The affected subtree start at the To node -- there's no extra work to do. + if (MinNode == ToTN) return; + + DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = " + << BlockNamePrinter(MinNode) << "\n"); + const unsigned MinLevel = MinNode->getLevel(); + const TreeNodePtr PrevIDom = MinNode->getIDom(); + assert(PrevIDom); + SNCA.clear(); + + // Identify nodes that remain in the affected subtree. + auto DescendBelow = [MinLevel, &DT](NodePtr, NodePtr To) { + const TreeNodePtr ToTN = DT.getNode(To); + return ToTN && ToTN->getLevel() > MinLevel; + }; + SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0); + + DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom) + << "\nRunning Semi-NCA\n"); + + // Rebuild the remaining part of affected subtree. + SNCA.runSemiNCA(DT, MinLevel); + SNCA.reattachExistingSubtree(DT, PrevIDom); + } + + // Removes leaf tree nodes from the dominator tree. 
+ static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) { + assert(TN); + assert(TN->getNumChildren() == 0 && "Not a tree leaf"); + + const TreeNodePtr IDom = TN->getIDom(); + assert(IDom); + + auto ChIt = llvm::find(IDom->Children, TN); + assert(ChIt != IDom->Children.end()); + std::swap(*ChIt, IDom->Children.back()); + IDom->Children.pop_back(); + + DT.DomTreeNodes.erase(TN->getBlock()); + } + + //~~ + //===--------------- DomTree correctness verification ---------------------=== + //~~ + // Check if for every parent with a level L in the tree all of its children // have level L + 1. static bool VerifyLevels(const DomTreeT &DT) { @@ -323,20 +784,18 @@ struct SemiNCAInfo { const TreeNodePtr IDom = TN->getIDom(); if (!IDom && TN->getLevel() != 0) { - errs() << "Node without an IDom "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has a nonzero level " << TN->getLevel() << "!\n"; + errs() << "Node without an IDom " << BlockNamePrinter(BB) + << " has a nonzero level " << TN->getLevel() << "!\n"; errs().flush(); return false; } if (IDom && TN->getLevel() != IDom->getLevel() + 1) { - errs() << "Node "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has level " << TN->getLevel() << " while it's IDom "; - PrintBlockOrNullptr(errs(), IDom->getBlock()); - errs() << " has level " << IDom->getLevel() << "!\n"; + errs() << "Node " << BlockNamePrinter(BB) << " has level " + << TN->getLevel() << " while its IDom " + << BlockNamePrinter(IDom->getBlock()) << " has level " + << IDom->getLevel() << "!\n"; errs().flush(); return false; @@ -363,18 +822,14 @@ struct SemiNCAInfo { assert(ToTN); const NodePtr NCD = DT.findNearestCommonDominator(From, To); - const TreeNodePtr NCDTN = NCD ? 
DT.getNode(NCD) : nullptr; + const TreeNodePtr NCDTN = DT.getNode(NCD); const TreeNodePtr ToIDom = ToTN->getIDom(); if (NCDTN != ToTN && NCDTN != ToIDom) { - errs() << "NearestCommonDominator verification failed:\n\tNCD(From:"; - PrintBlockOrNullptr(errs(), From); - errs() << ", To:"; - PrintBlockOrNullptr(errs(), To); - errs() << ") = "; - PrintBlockOrNullptr(errs(), NCD); - errs() << ",\t (should be To or IDom[To]: "; - PrintBlockOrNullptr(errs(), ToIDom ? ToIDom->getBlock() : nullptr); - errs() << ")\n"; + errs() << "NearestCommonDominator verification failed:\n\tNCD(From:" + << BlockNamePrinter(From) << ", To:" << BlockNamePrinter(To) + << ") = " << BlockNamePrinter(NCD) + << ",\t (should be To or IDom[To]: " << BlockNamePrinter(ToIDom) + << ")\n"; errs().flush(); return false; @@ -440,11 +895,9 @@ struct SemiNCAInfo { for (TreeNodePtr Child : TN->getChildren()) if (NodeToInfo.count(Child->getBlock()) != 0) { - errs() << "Child "; - PrintBlockOrNullptr(errs(), Child->getBlock()); - errs() << " reachable after its parent "; - PrintBlockOrNullptr(errs(), BB); - errs() << " is removed!\n"; + errs() << "Child " << BlockNamePrinter(Child) + << " reachable after its parent " << BlockNamePrinter(BB) + << " is removed!\n"; errs().flush(); return false; @@ -477,11 +930,9 @@ struct SemiNCAInfo { if (S == N) continue; if (NodeToInfo.count(S->getBlock()) == 0) { - errs() << "Node "; - PrintBlockOrNullptr(errs(), S->getBlock()); - errs() << " not reachable when its sibling "; - PrintBlockOrNullptr(errs(), N->getBlock()); - errs() << " is removed!\n"; + errs() << "Node " << BlockNamePrinter(S) + << " not reachable when its sibling " << BlockNamePrinter(N) + << " is removed!\n"; errs().flush(); return false; @@ -494,23 +945,30 @@ struct SemiNCAInfo { } }; -template -void Calculate(DominatorTreeBaseByGraphTraits> &DT, - FuncT &F) { - using NodePtr = typename GraphTraits::NodeRef; - static_assert(std::is_pointer::value, - "NodePtr should be a pointer type"); - 
SemiNCAInfo::type> SNCA; - SNCA.template runSemiNCA(DT, GraphTraits::size(&F)); + +template +void Calculate(DomTreeT &DT, FuncT &F) { + SemiNCAInfo SNCA; + SNCA.calculateFromScratch(DT, GraphTraits::size(&F)); } -template -bool Verify(const DominatorTreeBaseByGraphTraits> &DT) { - using NodePtr = typename GraphTraits::NodeRef; - static_assert(std::is_pointer::value, - "NodePtr should be a pointer type"); - SemiNCAInfo::type> SNCA; +template +void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To) { + if (DT.isPostDominator()) std::swap(From, To); + SemiNCAInfo::InsertEdge(DT, From, To); +} + +template +void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To) { + if (DT.isPostDominator()) std::swap(From, To); + SemiNCAInfo::DeleteEdge(DT, From, To); +} +template +bool Verify(const DomTreeT &DT) { + SemiNCAInfo SNCA; return SNCA.verifyReachability(DT) && SNCA.VerifyLevels(DT) && SNCA.verifyNCD(DT) && SNCA.verifyParentProperty(DT) && SNCA.verifySiblingProperty(DT); @@ -519,4 +977,6 @@ bool Verify(const DominatorTreeBaseByGraphTraits> &DT) { } // namespace DomTreeBuilder } // namespace llvm +#undef DEBUG_TYPE + #endif diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 72c28865ac57..e13582f6a6d3 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -85,6 +85,7 @@ enum ArchExtKind : unsigned { AEK_DSP = 0x400, AEK_FP16 = 0x800, AEK_RAS = 0x1000, + AEK_SVE = 0x2000, // Unsupported extensions. 
AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, @@ -166,7 +167,8 @@ enum ArchExtKind : unsigned { AEK_FP16 = 0x20, AEK_PROFILE = 0x40, AEK_RAS = 0x80, - AEK_LSE = 0x100 + AEK_LSE = 0x100, + AEK_SVE = 0x200 }; StringRef getCanonicalArchName(StringRef Arch); diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 15b3b11db045..71fdf47f1979 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -1114,6 +1114,10 @@ public: void *Ctxt = nullptr, SourceMgr::DiagHandlerTy DiagHandler = nullptr, void *DiagHandlerCtxt = nullptr); + Input(MemoryBufferRef Input, + void *Ctxt = nullptr, + SourceMgr::DiagHandlerTy DiagHandler = nullptr, + void *DiagHandlerCtxt = nullptr); ~Input() override; // Check if there was an syntax or semantic error during parsing. diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 178b08d7b8b7..50de41fd1320 100644 --- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -58,6 +58,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 60a03bdc182d..23711d636c9a 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2012,6 +2012,35 @@ public: return isExtFreeImpl(I); } + /// Return true if \p Load and \p Ext can form an ExtLoad. 
+ /// For example, in AArch64 + /// %L = load i8, i8* %ptr + /// %E = zext i8 %L to i32 + /// can be lowered into one load instruction + /// ldrb w0, [x0] + bool isExtLoad(const LoadInst *Load, const Instruction *Ext, + const DataLayout &DL) const { + EVT VT = getValueType(DL, Ext->getType()); + EVT LoadVT = getValueType(DL, Load->getType()); + + // If the load has other users and the truncate is not free, the ext + // probably isn't free. + if (!Load->hasOneUse() && (isTypeLegal(LoadVT) || !isTypeLegal(VT)) && + !isTruncateFree(Ext->getType(), Load->getType())) + return false; + + // Check whether the target supports casts folded into loads. + unsigned LType; + if (isa(Ext)) + LType = ISD::ZEXTLOAD; + else { + assert(isa(Ext) && "Unexpected ext type!"); + LType = ISD::SEXTLOAD; + } + + return isLoadExtLegal(LType, VT, LoadVT); + } + /// Return true if any actual instruction that defines a value of type FromTy /// implicitly zero-extends the value to ToTy in the result register. /// diff --git a/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h new file mode 100644 index 000000000000..964b0f7620a2 --- /dev/null +++ b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h @@ -0,0 +1,24 @@ +//===- DlltoolDriver.h - dlltool.exe-compatible driver ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a dlltool.exe-compatible driver. +// Used by llvm-dlltool. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H +#define LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H + +namespace llvm { +template class ArrayRef; + +int dlltoolDriverMain(ArrayRef ArgsArr); +} // namespace llvm + +#endif diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp index 3ddefc6520a7..74b5d79ebac5 100644 --- a/lib/Analysis/CGSCCPassManager.cpp +++ b/lib/Analysis/CGSCCPassManager.cpp @@ -433,7 +433,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass( if (Visited.insert(C).second) Worklist.push_back(C); - LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) { + auto VisitRef = [&](Function &Referee) { Node &RefereeN = *G.lookup(Referee); Edge *E = N->lookup(RefereeN); // FIXME: Similarly to new calls, we also currently preclude @@ -444,7 +444,12 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass( RetainedEdges.insert(&RefereeN); if (E->isCall()) DemotedCallTargets.insert(&RefereeN); - }); + }; + LazyCallGraph::visitReferences(Worklist, Visited, VisitRef); + + // Include synthetic reference edges to known, defined lib functions. + for (auto *F : G.getLibFunctions()) + VisitRef(*F); // First remove all of the edges that are no longer present in this function. 
// We have to build a list of dead targets first and then remove them as the diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp index 5b6e2d0476e4..c08c6cfe0c3b 100644 --- a/lib/Analysis/DominanceFrontier.cpp +++ b/lib/Analysis/DominanceFrontier.cpp @@ -14,7 +14,8 @@ using namespace llvm; namespace llvm { -template class DominanceFrontierBase; +template class DominanceFrontierBase; +template class DominanceFrontierBase; template class ForwardDominanceFrontierBase; } diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp index 27c6b580e7ac..95ab6ee3db5b 100644 --- a/lib/Analysis/InstCount.cpp +++ b/lib/Analysis/InstCount.cpp @@ -26,7 +26,6 @@ using namespace llvm; STATISTIC(TotalInsts , "Number of instructions (of all types)"); STATISTIC(TotalBlocks, "Number of basic blocks"); STATISTIC(TotalFuncs , "Number of non-external functions"); -STATISTIC(TotalMemInst, "Number of memory instructions"); #define HANDLE_INST(N, OPCODE, CLASS) \ STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts"); @@ -75,13 +74,6 @@ FunctionPass *llvm::createInstCountPass() { return new InstCount(); } // function. // bool InstCount::runOnFunction(Function &F) { - unsigned StartMemInsts = - NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst + - NumInvokeInst + NumAllocaInst; visit(F); - unsigned EndMemInsts = - NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst + - NumInvokeInst + NumAllocaInst; - TotalMemInst += EndMemInsts-StartMemInsts; return false; } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index f6632020b8fc..b4f3b87e1846 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1745,14 +1745,11 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Constant::getNullValue(Op0->getType()); // (A | ?) 
& A = A - Value *A = nullptr, *B = nullptr; - if (match(Op0, m_Or(m_Value(A), m_Value(B))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_c_Or(m_Specific(Op1), m_Value()))) return Op1; // A & (A | ?) = A - if (match(Op1, m_Or(m_Value(A), m_Value(B))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_c_Or(m_Specific(Op0), m_Value()))) return Op0; // A mask that only clears known zeros of a shifted value is a no-op. @@ -1852,26 +1849,22 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Constant::getAllOnesValue(Op0->getType()); // (A & ?) | A = A - Value *A = nullptr, *B = nullptr; - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) return Op1; // A | (A & ?) = A - if (match(Op1, m_And(m_Value(A), m_Value(B))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) return Op0; // ~(A & ?) | A = -1 - if (match(Op0, m_Not(m_And(m_Value(A), m_Value(B)))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op1->getType()); // A | ~(A & ?) 
= -1 - if (match(Op1, m_Not(m_And(m_Value(A), m_Value(B)))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op0->getType()); + Value *A, *B; // (A & ~B) | (A ^ B) -> (A ^ B) // (~B & A) | (A ^ B) -> (A ^ B) // (A & ~B) | (B ^ A) -> (B ^ A) diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp index 0e02850df349..3992657417c5 100644 --- a/lib/Analysis/IteratedDominanceFrontier.cpp +++ b/lib/Analysis/IteratedDominanceFrontier.cpp @@ -17,8 +17,8 @@ #include namespace llvm { -template -void IDFCalculator::calculate( +template +void IDFCalculator::calculate( SmallVectorImpl &PHIBlocks) { // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. @@ -88,6 +88,6 @@ void IDFCalculator::calculate( } } -template class IDFCalculator; -template class IDFCalculator>; +template class IDFCalculator; +template class IDFCalculator, true>; } diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp index a4c3e43b4b0c..d287f81985fd 100644 --- a/lib/Analysis/LazyCallGraph.cpp +++ b/lib/Analysis/LazyCallGraph.cpp @@ -106,6 +106,13 @@ LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() { LazyCallGraph::Edge::Ref); }); + // Add implicit reference edges to any defined libcall functions (if we + // haven't found an explicit edge). + for (auto *F : G->LibFunctions) + if (!Visited.count(F)) + addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*F), + LazyCallGraph::Edge::Ref); + return *Edges; } @@ -120,15 +127,34 @@ LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const { } #endif -LazyCallGraph::LazyCallGraph(Module &M) { +static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) { + LibFunc LF; + + // Either this is a normal library function or a "vectorizable" function. 
+ return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName()); +} + +LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) { DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier() << "\n"); - for (Function &F : M) - if (!F.isDeclaration() && !F.hasLocalLinkage()) { - DEBUG(dbgs() << " Adding '" << F.getName() - << "' to entry set of the graph.\n"); - addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); - } + for (Function &F : M) { + if (F.isDeclaration()) + continue; + // If this function is a known lib function to LLVM then we want to + // synthesize reference edges to it to model the fact that LLVM can turn + // arbitrary code into a library function call. + if (isKnownLibFunction(F, TLI)) + LibFunctions.insert(&F); + + if (F.hasLocalLinkage()) + continue; + + // External linkage defined functions have edges to them from other + // modules. + DEBUG(dbgs() << " Adding '" << F.getName() + << "' to entry set of the graph.\n"); + addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); + } // Now add entry nodes for functions reachable via initializers to globals. 
SmallVector Worklist; @@ -149,7 +175,8 @@ LazyCallGraph::LazyCallGraph(Module &M) { LazyCallGraph::LazyCallGraph(LazyCallGraph &&G) : BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)), EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)), - SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)) { + SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)), + LibFunctions(std::move(G.LibFunctions)) { updateGraphPtrs(); } @@ -160,6 +187,7 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) { SCCBPA = std::move(G.SCCBPA); SCCMap = std::move(G.SCCMap); LeafRefSCCs = std::move(G.LeafRefSCCs); + LibFunctions = std::move(G.LibFunctions); updateGraphPtrs(); return *this; } @@ -1580,6 +1608,11 @@ void LazyCallGraph::removeDeadFunction(Function &F) { assert(F.use_empty() && "This routine should only be called on trivially dead functions!"); + // We shouldn't remove library functions as they are never really dead while + // the call graph is in use -- every function definition refers to them. + assert(!isLibFunction(F) && + "Must not remove lib functions from the call graph!"); + auto NI = NodeMap.find(&F); if (NI == NodeMap.end()) // Not in the graph at all! 
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index baf932432a0a..697b58622bb4 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -609,7 +609,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { return NearLoop; } -LoopInfo::LoopInfo(const DominatorTreeBase &DomTree) { +LoopInfo::LoopInfo(const DomTreeBase &DomTree) { analyze(DomTree); } diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 86d0d92799f2..86de474c7aa9 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -39,7 +39,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Transforms/Scalar.h" #include #define DEBUG_TYPE "memoryssa" diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp index 1caf151546d9..811373ac850b 100644 --- a/lib/Analysis/PostDominators.cpp +++ b/lib/Analysis/PostDominators.cpp @@ -23,6 +23,8 @@ using namespace llvm; #define DEBUG_TYPE "postdomtree" +template class llvm::DominatorTreeBase; // PostDomTreeBase + //===----------------------------------------------------------------------===// // PostDominatorTree Implementation //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 3fb1ab980add..b973203a89b6 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -4173,6 +4173,319 @@ static Optional MatchBinaryOp(Value *V, DominatorTree &DT) { return None; } +/// Helper function to createAddRecFromPHIWithCasts. We have a phi +/// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via +/// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the +/// way. 
This function checks if \p Op, an operand of this SCEVAddExpr, +/// follows one of the following patterns: +/// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) +/// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) +/// If the SCEV expression of \p Op conforms with one of the expected patterns +/// we return the type of the truncation operation, and indicate whether the +/// truncated type should be treated as signed/unsigned by setting +/// \p Signed to true/false, respectively. +static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI, + bool &Signed, ScalarEvolution &SE) { + + // The case where Op == SymbolicPHI (that is, with no type conversions on + // the way) is handled by the regular add recurrence creating logic and + // would have already been triggered in createAddRecForPHI. Reaching it here + // means that createAddRecFromPHI had failed for this PHI before (e.g., + // because one of the other operands of the SCEVAddExpr updating this PHI is + // not invariant). + // + // Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in + // this case predicates that allow us to prove that Op == SymbolicPHI will + // be added. + if (Op == SymbolicPHI) + return nullptr; + + unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType()); + unsigned NewBits = SE.getTypeSizeInBits(Op->getType()); + if (SourceBits != NewBits) + return nullptr; + + const SCEVSignExtendExpr *SExt = dyn_cast(Op); + const SCEVZeroExtendExpr *ZExt = dyn_cast(Op); + if (!SExt && !ZExt) + return nullptr; + const SCEVTruncateExpr *Trunc = + SExt ? dyn_cast(SExt->getOperand()) + : dyn_cast(ZExt->getOperand()); + if (!Trunc) + return nullptr; + const SCEV *X = Trunc->getOperand(); + if (X != SymbolicPHI) + return nullptr; + Signed = SExt ? 
true : false; + return Trunc->getType(); +} + +static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) { + if (!PN->getType()->isIntegerTy()) + return nullptr; + const Loop *L = LI.getLoopFor(PN->getParent()); + if (!L || L->getHeader() != PN->getParent()) + return nullptr; + return L; +} + +// Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the +// computation that updates the phi follows the following pattern: +// (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum +// which correspond to a phi->trunc->sext/zext->add->phi update chain. +// If so, try to see if it can be rewritten as an AddRecExpr under some +// Predicates. If successful, return them as a pair. Also cache the results +// of the analysis. +// +// Example usage scenario: +// Say the Rewriter is called for the following SCEV: +// 8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step) +// where: +// %X = phi i64 (%Start, %BEValue) +// It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X), +// and call this function with %SymbolicPHI = %X. +// +// The analysis will find that the value coming around the backedge has +// the following SCEV: +// BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step) +// Upon concluding that this matches the desired pattern, the function +// will return the pair {NewAddRec, SmallPredsVec} where: +// NewAddRec = {%Start,+,%Step} +// SmallPredsVec = {P1, P2, P3} as follows: +// P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)} Flags: +// P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64) +// P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64) +// The returned pair means that SymbolicPHI can be rewritten into NewAddRec +// under the predicates {P1,P2,P3}. 
+// This predicated rewrite will be cached in PredicatedSCEVRewrites: +// PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3)} +// +// TODO's: +// +// 1) Extend the Induction descriptor to also support inductions that involve +// casts: When needed (namely, when we are called in the context of the +// vectorizer induction analysis), a Set of cast instructions will be +// populated by this method, and provided back to isInductionPHI. This is +// needed to allow the vectorizer to properly record them to be ignored by +// the cost model and to avoid vectorizing them (otherwise these casts, +// which are redundant under the runtime overflow checks, will be +// vectorized, which can be costly). +// +// 2) Support additional induction/PHISCEV patterns: We also want to support +// inductions where the sext-trunc / zext-trunc operations (partly) occur +// after the induction update operation (the induction increment): +// +// (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix) +// which correspond to a phi->add->trunc->sext/zext->phi update chain. +// +// (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix) +// which correspond to a phi->trunc->add->sext/zext->phi update chain. +// +// 3) Outline common code with createAddRecFromPHI to avoid duplication. +// +Optional>> +ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) { + SmallVector Predicates; + + // *** Part1: Analyze if we have a phi-with-cast pattern for which we can + // return an AddRec expression under some predicate. + + auto *PN = cast(SymbolicPHI->getValue()); + const Loop *L = isIntegerLoopHeaderPHI(PN, LI); + assert (L && "Expecting an integer loop header phi"); + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi as an addrec if it has a unique entry value and a unique + // backedge value. 
+ Value *BEValueV = nullptr, *StartValueV = nullptr; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = PN->getIncomingValue(i); + if (L->contains(PN->getIncomingBlock(i))) { + if (!BEValueV) { + BEValueV = V; + } else if (BEValueV != V) { + BEValueV = nullptr; + break; + } + } else if (!StartValueV) { + StartValueV = V; + } else if (StartValueV != V) { + StartValueV = nullptr; + break; + } + } + if (!BEValueV || !StartValueV) + return None; + + const SCEV *BEValue = getSCEV(BEValueV); + + // If the value coming around the backedge is an add with the symbolic + // value we just inserted, possibly with casts that we can ignore under + // an appropriate runtime guard, then we found a simple induction variable! + const auto *Add = dyn_cast(BEValue); + if (!Add) + return None; + + // If there is a single occurrence of the symbolic value, possibly + // casted, replace it with a recurrence. + unsigned FoundIndex = Add->getNumOperands(); + Type *TruncTy = nullptr; + bool Signed; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if ((TruncTy = + isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this))) + if (FoundIndex == e) { + FoundIndex = i; + break; + } + + if (FoundIndex == Add->getNumOperands()) + return None; + + // Create an add with everything but the specified operand. + SmallVector Ops; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (i != FoundIndex) + Ops.push_back(Add->getOperand(i)); + const SCEV *Accum = getAddExpr(Ops); + + // The runtime checks will not be valid if the step amount is + // varying inside the loop. 
+ if (!isLoopInvariant(Accum, L)) + return None; + + + // *** Part2: Create the predicates + + // Analysis was successful: we have a phi-with-cast pattern for which we + // can return an AddRec expression under the following predicates: + // + // P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum) + // fits within the truncated type (does not overflow) for i = 0 to n-1. + // P2: An Equal predicate that guarantees that + // Start = (Ext ix (Trunc iy (Start) to ix) to iy) + // P3: An Equal predicate that guarantees that + // Accum = (Ext ix (Trunc iy (Accum) to ix) to iy) + // + // As we next prove, the above predicates guarantee that: + // Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy) + // + // + // More formally, we want to prove that: + // Expr(i+1) = Start + (i+1) * Accum + // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum + // + // Given that: + // 1) Expr(0) = Start + // 2) Expr(1) = Start + Accum + // = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2 + // 3) Induction hypothesis (step i): + // Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + // + // Proof: + // Expr(i+1) = + // = Start + (i+1)*Accum + // = (Start + i*Accum) + Accum + // = Expr(i) + Accum + // = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum + // :: from step i + // + // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum + // + // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + // + (Ext ix (Trunc iy (Accum) to ix) to iy) + // + Accum :: from P3 + // + // = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy) + // + Accum :: from P1: Ext(x)+Ext(y)=>Ext(x+y) + // + // = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum + // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum + // + // By induction, the same applies to all iterations 1<=i(PHISCEV); + + SCEVWrapPredicate::IncrementWrapFlags AddedFlags = + Signed ? 
SCEVWrapPredicate::IncrementNSSW + : SCEVWrapPredicate::IncrementNUSW; + const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags); + Predicates.push_back(AddRecPred); + + // Create the Equal Predicates P2,P3: + auto AppendPredicate = [&](const SCEV *Expr) -> void { + assert (isLoopInvariant(Expr, L) && "Expr is expected to be invariant"); + const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy); + const SCEV *ExtendedExpr = + Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType()) + : getZeroExtendExpr(TruncatedExpr, Expr->getType()); + if (Expr != ExtendedExpr && + !isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) { + const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr); + DEBUG (dbgs() << "Added Predicate: " << *Pred); + Predicates.push_back(Pred); + } + }; + + AppendPredicate(StartVal); + AppendPredicate(Accum); + + // *** Part3: Predicates are ready. Now go ahead and create the new addrec in + // which the casts had been folded away. The caller can rewrite SymbolicPHI + // into NewAR if it will also add the runtime overflow checks specified in + // Predicates. + auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap); + + std::pair> PredRewrite = + std::make_pair(NewAR, Predicates); + // Remember the result of the analysis for this SCEV at this locayyytion. + PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite; + return PredRewrite; +} + +Optional>> +ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) { + + auto *PN = cast(SymbolicPHI->getValue()); + const Loop *L = isIntegerLoopHeaderPHI(PN, LI); + if (!L) + return None; + + // Check to see if we already analyzed this PHI. 
+ auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L}); + if (I != PredicatedSCEVRewrites.end()) { + std::pair> Rewrite = + I->second; + // Analysis was done before and failed to create an AddRec: + if (Rewrite.first == SymbolicPHI) + return None; + // Analysis was done before and succeeded to create an AddRec under + // a predicate: + assert(isa(Rewrite.first) && "Expected an AddRec"); + assert(!(Rewrite.second).empty() && "Expected to find Predicates"); + return Rewrite; + } + + Optional>> + Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI); + + // Record in the cache that the analysis failed + if (!Rewrite) { + SmallVector Predicates; + PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates}; + return None; + } + + return Rewrite; +} + /// A helper function for createAddRecFromPHI to handle simple cases. /// /// This function tries to find an AddRec expression for the simplest (yet most @@ -5904,6 +6217,16 @@ void ScalarEvolution::forgetLoop(const Loop *L) { RemoveLoopFromBackedgeMap(BackedgeTakenCounts); RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts); + // Drop information about predicated SCEV rewrites for this loop. + for (auto I = PredicatedSCEVRewrites.begin(); + I != PredicatedSCEVRewrites.end();) { + std::pair Entry = I->first; + if (Entry.second == L) + PredicatedSCEVRewrites.erase(I++); + else + ++I; + } + // Drop information about expressions based on loop-header PHIs. 
SmallVector Worklist; PushLoopPHIs(L, Worklist); @@ -10062,6 +10385,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) UniqueSCEVs(std::move(Arg.UniqueSCEVs)), UniquePreds(std::move(Arg.UniquePreds)), SCEVAllocator(std::move(Arg.SCEVAllocator)), + PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)), FirstUnknown(Arg.FirstUnknown) { Arg.FirstUnknown = nullptr; } @@ -10462,6 +10786,15 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { HasRecMap.erase(S); MinTrailingZerosCache.erase(S); + for (auto I = PredicatedSCEVRewrites.begin(); + I != PredicatedSCEVRewrites.end();) { + std::pair Entry = I->first; + if (Entry.first == S) + PredicatedSCEVRewrites.erase(I++); + else + ++I; + } + auto RemoveSCEVFromBackedgeMap = [S, this](DenseMap &Map) { for (auto I = Map.begin(), E = Map.end(); I != E;) { @@ -10621,10 +10954,11 @@ void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive(); } -const SCEVPredicate * -ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS, - const SCEVConstant *RHS) { +const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS, + const SCEV *RHS) { FoldingSetNodeID ID; + assert(LHS->getType() == RHS->getType() && + "Type mismatch between LHS and RHS"); // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Equal); ID.AddPointer(LHS); @@ -10687,8 +11021,7 @@ public: if (IPred->getLHS() == Expr) return IPred->getRHS(); } - - return Expr; + return convertToAddRecWithPreds(Expr); } const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { @@ -10724,17 +11057,41 @@ public: } private: - bool addOverflowAssumption(const SCEVAddRecExpr *AR, - SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { - auto *A = SE.getWrapPredicate(AR, AddedFlags); + bool addOverflowAssumption(const SCEVPredicate *P) { if (!NewPreds) { // Check if we've already made this assumption. 
- return Pred && Pred->implies(A); + return Pred && Pred->implies(P); } - NewPreds->insert(A); + NewPreds->insert(P); return true; } + bool addOverflowAssumption(const SCEVAddRecExpr *AR, + SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { + auto *A = SE.getWrapPredicate(AR, AddedFlags); + return addOverflowAssumption(A); + } + + // If \p Expr represents a PHINode, we try to see if it can be represented + // as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible + // to add this predicate as a runtime overflow check, we return the AddRec. + // If \p Expr does not meet these conditions (is not a PHI node, or we + // couldn't create an AddRec for it, or couldn't add the predicate), we just + // return \p Expr. + const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) { + if (!isa(Expr->getValue())) + return Expr; + Optional>> + PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr); + if (!PredicatedRewrite) + return Expr; + for (auto *P : PredicatedRewrite->second){ + if (!addOverflowAssumption(P)) + return Expr; + } + return PredicatedRewrite->first; + } + SmallPtrSetImpl *NewPreds; SCEVUnionPredicate *Pred; const Loop *L; @@ -10771,9 +11128,11 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, : FastID(ID), Kind(Kind) {} SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, - const SCEVUnknown *LHS, - const SCEVConstant *RHS) - : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {} + const SCEV *LHS, const SCEV *RHS) + : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) { + assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match"); + assert(LHS != RHS && "LHS and RHS are the same SCEV"); +} bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 94bbc58541a7..25813c65037f 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ 
-82,6 +82,11 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr, return TTIImpl->getGEPCost(PointeeType, Ptr, Operands); } +int TargetTransformInfo::getExtCost(const Instruction *I, + const Value *Src) const { + return TTIImpl->getExtCost(I, Src); +} + int TargetTransformInfo::getIntrinsicCost( Intrinsic::ID IID, Type *RetTy, ArrayRef Arguments) const { int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments); diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 428bb21fbf51..90e0d6a216ee 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -588,7 +588,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(spir_func); KEYWORD(intel_ocl_bicc); KEYWORD(x86_64_sysvcc); - KEYWORD(x86_64_win64cc); + KEYWORD(win64cc); KEYWORD(x86_regcallcc); KEYWORD(webkit_jscc); KEYWORD(swiftcc); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 717eb0e00f4f..13679ce1d25c 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1670,7 +1670,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'spir_func' /// ::= 'spir_kernel' /// ::= 'x86_64_sysvcc' -/// ::= 'x86_64_win64cc' +/// ::= 'win64cc' /// ::= 'webkit_jscc' /// ::= 'anyregcc' /// ::= 'preserve_mostcc' @@ -1712,7 +1712,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; - case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break; + case lltok::kw_win64cc: CC = CallingConv::Win64; break; case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break; case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; @@ -4411,13 +4411,15 @@ bool LLParser::ParseDIImportedEntity(MDNode *&Result, bool IsDistinct) { REQUIRED(tag, 
DwarfTagField, ); \ REQUIRED(scope, MDField, ); \ OPTIONAL(entity, MDField, ); \ + OPTIONAL(file, MDField, ); \ OPTIONAL(line, LineField, ); \ OPTIONAL(name, MDStringField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIImportedEntity, (Context, tag.Val, scope.Val, - entity.Val, line.Val, name.Val)); + Result = GET_OR_DISTINCT( + DIImportedEntity, + (Context, tag.Val, scope.Val, entity.Val, file.Val, line.Val, name.Val)); return false; } diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 9c7a06de81b4..0f3707ba0d1e 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -141,7 +141,7 @@ enum Kind { kw_spir_kernel, kw_spir_func, kw_x86_64_sysvcc, - kw_x86_64_win64cc, + kw_win64cc, kw_webkit_jscc, kw_anyregcc, kw_swiftcc, diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index b1504a8034e0..10fbcdea784f 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1671,15 +1671,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_IMPORTED_ENTITY: { - if (Record.size() != 6) + if (Record.size() != 6 && Record.size() != 7) return error("Invalid record"); IsDistinct = Record[0]; + bool HasFile = (Record.size() == 7); MetadataList.assignValue( GET_OR_DISTINCT(DIImportedEntity, (Context, Record[1], getMDOrNull(Record[2]), - getDITypeRefOrNull(Record[3]), Record[4], - getMDString(Record[5]))), + getDITypeRefOrNull(Record[3]), + HasFile ? getMDOrNull(Record[6]) : nullptr, + HasFile ? 
Record[4] : 0, getMDString(Record[5]))), NextMetadataNo); NextMetadataNo++; break; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 0e518d2bbc8f..dcffde1742cd 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1718,6 +1718,7 @@ void ModuleBitcodeWriter::writeDIImportedEntity( Record.push_back(VE.getMetadataOrNullID(N->getEntity())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawFile())); Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev); Record.clear(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d4a90eeabe15..676c48fe5c67 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -664,8 +664,9 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( else EntityDie = getDIE(Entity); assert(EntityDie); - addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(), - Module->getScope()->getDirectory()); + auto *File = Module->getFile(); + addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "", + File ? File->getDirectory() : ""); addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie); StringRef Name = Module->getName(); if (!Name.empty()) diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index b7155ac2480a..45dc13d58de7 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -4267,9 +4267,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. 
- Value *Consensus = nullptr; - unsigned NumUsesConsensus = 0; - bool IsNumUsesConsensusValid = false; + bool AddrModeFound = false; bool PhiSeen = false; SmallVector AddrModeInsts; ExtAddrMode AddrMode; @@ -4280,11 +4278,17 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Value *V = worklist.back(); worklist.pop_back(); - // Break use-def graph loops. - if (!Visited.insert(V).second) { - Consensus = nullptr; - break; - } + // We allow traversing cyclic Phi nodes. + // In case of success after this loop we ensure that traversing through + // Phi nodes ends up with all cases to compute address of the form + // BaseGV + Base + Scale * Index + Offset + // where Scale and Offset are constans and BaseGV, Base and Index + // are exactly the same Values in all cases. + // It means that BaseGV, Scale and Offset dominate our memory instruction + // and have the same value as they had in address computation represented + // as Phi. So we can safely sink address computation to memory instruction. + if (!Visited.insert(V).second) + continue; // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast(V)) { @@ -4297,47 +4301,26 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. Note that // the result may differ depending on what other uses our candidate // addressing instructions might have. - SmallVector NewAddrModeInsts; + AddrModeInsts.clear(); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI, - InsertedInsts, PromotedInsts, TPT); - - // This check is broken into two cases with very similar code to avoid using - // getNumUses() as much as possible. Some values have a lot of uses, so - // calling getNumUses() unconditionally caused a significant compile-time - // regression. 
- if (!Consensus) { - Consensus = V; - AddrMode = NewAddrMode; - AddrModeInsts = NewAddrModeInsts; - continue; - } else if (NewAddrMode == AddrMode) { - if (!IsNumUsesConsensusValid) { - NumUsesConsensus = Consensus->getNumUses(); - IsNumUsesConsensusValid = true; - } + V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, + InsertedInsts, PromotedInsts, TPT); - // Ensure that the obtained addressing mode is equivalent to that obtained - // for all other roots of the PHI traversal. Also, when choosing one - // such root as representative, select the one with the most uses in order - // to keep the cost modeling heuristics in AddressingModeMatcher - // applicable. - unsigned NumUses = V->getNumUses(); - if (NumUses > NumUsesConsensus) { - Consensus = V; - NumUsesConsensus = NumUses; - AddrModeInsts = NewAddrModeInsts; - } + if (!AddrModeFound) { + AddrModeFound = true; + AddrMode = NewAddrMode; continue; } + if (NewAddrMode == AddrMode) + continue; - Consensus = nullptr; + AddrModeFound = false; break; } // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. - if (!Consensus) { + if (!AddrModeFound) { TPT.rollback(LastKnownGood); return false; } @@ -4847,25 +4830,7 @@ bool CodeGenPrepare::canFormExtLd( if (!HasPromoted && LI->getParent() == Inst->getParent()) return false; - EVT VT = TLI->getValueType(*DL, Inst->getType()); - EVT LoadVT = TLI->getValueType(*DL, LI->getType()); - - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(Inst->getType(), LI->getType())) - return false; - - // Check whether the target supports casts folded into loads. 
- unsigned LType; - if (isa(Inst)) - LType = ISD::ZEXTLOAD; - else { - assert(isa(Inst) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - - return TLI->isLoadExtLegal(LType, VT, LoadVT); + return TLI->isExtLoad(LI, Inst, *DL); } /// Move a zext or sext fed by a load into the same basic block as the load, diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 49fb5e8f075b..5258370e6680 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -433,9 +433,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { } case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: { unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_SREM || MI.getOpcode() == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index c176de16b593..e6f80dbb8630 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -11,8 +11,6 @@ // instructions do not lengthen the critical path or the resource depth. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-combiner" - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" @@ -32,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "machine-combiner" + STATISTIC(NumInstCombined, "Number of machineinst combined"); namespace { diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp index 28ecc8f96805..b559e4e513a6 100644 --- a/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/lib/CodeGen/MachineDominanceFrontier.cpp @@ -15,7 +15,8 @@ using namespace llvm; namespace llvm { -template class DominanceFrontierBase; +template class DominanceFrontierBase; +template class DominanceFrontierBase; template class ForwardDominanceFrontierBase; } diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 65e9e5d195a4..845e8232477c 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -31,7 +31,7 @@ static cl::opt VerifyMachineDomInfoX( namespace llvm { template class DomTreeNodeBase; -template class DominatorTreeBase; +template class DominatorTreeBase; // DomTreeBase } char MachineDominatorTree::ID = 0; @@ -49,7 +49,7 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); - DT.reset(new DominatorTreeBase(false)); + DT.reset(new DomTreeBase()); DT->recalculate(F); return false; } @@ -144,7 +144,7 @@ void MachineDominatorTree::verifyDomTree() const { return; MachineFunction &F = *getRoot()->getParent(); - DominatorTreeBase OtherDT(false); + DomTreeBase OtherDT; OtherDT.recalculate(F); if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || DT->compare(OtherDT)) { diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp index 
c3f6e9249e7d..488377998cb3 100644 --- a/lib/CodeGen/MachinePostDominators.cpp +++ b/lib/CodeGen/MachinePostDominators.cpp @@ -16,6 +16,10 @@ using namespace llvm; +namespace llvm { +template class DominatorTreeBase; // PostDomTreeBase +} + char MachinePostDominatorTree::ID = 0; //declare initializeMachinePostDominatorTreePass @@ -24,8 +28,7 @@ INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree", MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) { initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase(true); //true indicate - // postdominator + DT = new PostDomTreeBase(); } FunctionPass * diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 71382c18fdf9..d5d3f7a61a9f 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3889,9 +3889,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. 
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { - ConstantSDNode *SubLHS = isConstOrConstSplat(N0.getOperand(0)); - SDValue SubRHS = N0.getOperand(1); - if (SubLHS && SubLHS->isNullValue()) { + if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { + SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; @@ -4586,6 +4585,20 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, return nullptr; } +// if Left + Right == Sum (constant or constant splat vector) +static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum, + SelectionDAG &DAG, const SDLoc &DL) { + EVT ShiftVT = Left.getValueType(); + if (ShiftVT != Right.getValueType()) return false; + + SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT, + Left.getNode(), Right.getNode()); + if (!ShiftSum) return false; + + ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum); + return CSum && CSum->getZExtValue() == Sum; +} + // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. @@ -4631,30 +4644,24 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { - uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != EltSizeInBits) - return nullptr; - + if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. 
if (LHSMask.getNode() || RHSMask.getNode()) { - SDValue Mask = DAG.getAllOnesConstant(DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue Mask = AllOnes; if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, LHSMask, - DAG.getConstant(RHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, RHSMask, - DAG.getConstant(LHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); @@ -5272,11 +5279,21 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x if (isNullConstantOrNullSplatConstant(N1)) return N0; + // fold (rot x, c) -> (rot x, c % BitSize) + if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { + if (Cst->getAPIntValue().uge(Bitsize)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); + return DAG.getNode(N->getOpcode(), dl, VT, N0, + DAG.getConstant(RotAmt, dl, N1.getValueType())); + } + } + // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). 
if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { @@ -5286,22 +5303,24 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) - if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) - if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) - if (SDNode *C2 = - DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { - bool SameSide = (N->getOpcode() == NextOp); - unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; - if (SDValue CombinedShift = - DAG.FoldConstantArithmetic(CombineOp, dl, VT, C1, C2)) { - unsigned Bitsize = VT.getScalarSizeInBits(); - SDValue BitsizeC = DAG.getConstant(Bitsize, dl, VT); - SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( - ISD::SREM, dl, VT, CombinedShift.getNode(), BitsizeC.getNode()); - return DAG.getNode( - N->getOpcode(), dl, VT, N0->getOperand(0), CombinedShiftNorm); - } + if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { + SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { + EVT ShiftVT = C1->getValueType(0); + bool SameSide = (N->getOpcode() == NextOp); + unsigned CombineOp = SameSide ? 
ISD::ADD : ISD::SUB; + if (SDValue CombinedShift = + DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { + SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); + SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( + ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), + BitsizeC.getNode()); + return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), + CombinedShiftNorm); } + } + } return SDValue(); } @@ -7152,8 +7171,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); } } @@ -7210,8 +7235,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! 
} } } @@ -7451,8 +7481,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -7503,8 +7539,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } @@ -7676,13 +7717,18 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. 
+ bool NoReplaceTrunc = N0.hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -11373,12 +11419,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Replace uses with load result and token factor. Don't add users - // to work list. - return CombineTo(N, ReplLoad.getValue(0), Token, false); + // Replace uses with load result and token factor + return CombineTo(N, ReplLoad.getValue(0), Token); } } @@ -12744,7 +12786,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); + unsigned Elts = i + 1; + if (MemVT.isVector()) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, @@ -13003,7 +13050,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = isDereferenceable ? + MachineMemOperand::Flags MMOFlags = isDereferenceable ? 
MachineMemOperand::MODereferenceable: MachineMemOperand::MONone; @@ -16703,6 +16750,20 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); + // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be + // able to calculate their relative offset if at least one arises + // from an alloca. However, these allocas cannot overlap and we + // can infer there is no alias. + if (auto *A = dyn_cast(BasePtr0.getBase())) + if (auto *B = dyn_cast(BasePtr1.getBase())) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // If the base are the same frame index but the we couldn't find a + // constant offset, (indices are different) be conservative. + if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || + !MFI.isFixedObjectIndex(B->getIndex()))) + return false; + } + // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis // modified to use BaseIndexOffset. diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index ac3247948169..75fec7bd1d48 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1827,10 +1827,11 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? 
ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); + if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1863,6 +1864,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); + + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { + SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + return; + } + SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1877,9 +1885,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, - DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); + + SDValue Borrow; + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) + Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); + else + Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); + Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 1a8d5a4f45da..0b4c6e551667 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -142,9 +142,9 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return false; // Invalid value for threshold. 
// Count the number of MachineInstr`s in MachineFunction - int64_t MICount = 0; - for (const auto& MBB : MF) - MICount += MBB.size(); + int64_t MICount = 0; + for (const auto& MBB : MF) + MICount += MBB.size(); // Check if we have a loop. // FIXME: Maybe make this smarter, and see whether the loops are dependent diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 22f166a2335d..79b9fdefd40e 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" @@ -42,13 +41,6 @@ static Error visitKnownMember(CVMemberRecord &Record, return Error::success(); } -static Expected deserializeTypeServerRecord(CVType &Record) { - TypeServer2Record R(TypeRecordKind::TypeServer2); - if (auto EC = TypeDeserializer::deserializeAs(Record, R)) - return std::move(EC); - return R; -} - static Error visitMemberRecord(CVMemberRecord &Record, TypeVisitorCallbacks &Callbacks) { if (auto EC = Callbacks.visitMemberBegin(Record)) @@ -84,8 +76,6 @@ class CVTypeVisitor { public: explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks); - void addTypeServerHandler(TypeServerHandler &Handler); - Error visitTypeRecord(CVType &Record, TypeIndex Index); Error visitTypeRecord(CVType &Record); @@ -98,45 +88,15 @@ public: Error visitFieldListMemberStream(BinaryStreamReader &Stream); private: - Expected handleTypeServer(CVType &Record); Error finishVisitation(CVType &Record); /// The interface to the class that gets notified of each visitation. 
TypeVisitorCallbacks &Callbacks; - - TinyPtrVector Handlers; }; CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks) : Callbacks(Callbacks) {} -void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) { - Handlers.push_back(&Handler); -} - -Expected CVTypeVisitor::handleTypeServer(CVType &Record) { - if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) { - auto TS = deserializeTypeServerRecord(Record); - if (!TS) - return TS.takeError(); - - for (auto Handler : Handlers) { - auto ExpectedResult = Handler->handle(*TS, Callbacks); - // If there was an error, return the error. - if (!ExpectedResult) - return ExpectedResult.takeError(); - - // If the handler processed the record, return success. - if (*ExpectedResult) - return true; - - // Otherwise keep searching for a handler, eventually falling out and - // using the default record handler. - } - } - return false; -} - Error CVTypeVisitor::finishVisitation(CVType &Record) { switch (Record.Type) { default: @@ -163,12 +123,6 @@ Error CVTypeVisitor::finishVisitation(CVType &Record) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record, Index)) return EC; @@ -176,12 +130,6 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record)) return EC; @@ -271,52 +219,37 @@ struct VisitHelper { Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { 
VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record, Index); } Error llvm::codeview::visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record); } Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(CVTypeRange Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(TypeCollection &Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { // When the internal visitor calls Types.getType(Index) the interface is // required to return a CVType with the bytes filled out. So we can assume // that the bytes will be present when individual records are visited. 
VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 711144fc2faa..4fc14480578e 100644 --- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -168,18 +168,19 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) { return Error::success(); } -Error CodeViewRecordIO::mapGuid(StringRef &Guid) { +Error CodeViewRecordIO::mapGuid(GUID &Guid) { constexpr uint32_t GuidSize = 16; if (maxFieldLength() < GuidSize) return make_error(cv_error_code::insufficient_buffer); if (isWriting()) { - assert(Guid.size() == 16 && "Invalid Guid Size!"); - if (auto EC = Writer->writeFixedString(Guid)) + if (auto EC = Writer->writeBytes(Guid.Guid)) return EC; } else { - if (auto EC = Reader->readFixedString(Guid, 16)) + ArrayRef GuidBytes; + if (auto EC = Reader->readBytes(GuidBytes, GuidSize)) return EC; + memcpy(Guid.Guid, GuidBytes.data(), GuidSize); } return Error::success(); } diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp index 1fa8d219d6ac..b8d89c76da3b 100644 --- a/lib/DebugInfo/CodeView/Formatters.cpp +++ b/lib/DebugInfo/CodeView/Formatters.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -39,3 +40,9 @@ void GuidAdapter::format(raw_ostream &Stream, StringRef Style) { } Stream << "}"; } + +raw_ostream &llvm::codeview::operator<<(raw_ostream &OS, const GUID &Guid) { + codeview::detail::GuidAdapter A(Guid.Guid); + A.format(OS, ""); + return OS; +} diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index c2c02f8de03f..62e73acc72d6 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ 
b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -186,7 +186,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BuildInfoSym &BuildInfo) { - W.printNumber("BuildId", BuildInfo.BuildId); + printTypeIndex("BuildId", BuildInfo.BuildId); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 589966705015..e18a35ca1f38 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -354,7 +354,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { } Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { - W->printString("Guid", formatv("{0}", fmt_guid(TS.getGuid())).str()); + W->printString("Guid", formatv("{0}", TS.getGuid()).str()); W->printNumber("Age", TS.getAge()); W->printString("Name", TS.getName()); return Error::success(); diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 71a0966df036..bff3516203a0 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -10,13 +10,11 @@ #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" @@ -57,56 +55,35 @@ namespace { /// streams: an item (or IPI) stream and a type stream, as this is what is /// actually stored in the final PDB. 
We choose which records go where by /// looking at the record kind. -class TypeStreamMerger : public TypeVisitorCallbacks { +class TypeStreamMerger { public: - explicit TypeStreamMerger(SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler) - : Handler(Handler), IndexMap(SourceToDest) { + explicit TypeStreamMerger(SmallVectorImpl &SourceToDest) + : IndexMap(SourceToDest) { SourceToDest.clear(); } static const TypeIndex Untranslated; - Error visitTypeBegin(CVType &Record) override; - Error visitTypeEnd(CVType &Record) override; - Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, - const CVTypeArray &Ids); + const CVTypeArray &Ids); Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types); private: Error doit(const CVTypeArray &Types); + Error remapAllTypes(const CVTypeArray &Types); + + Error remapType(const CVType &Type); + void addMapping(TypeIndex Idx); bool remapTypeIndex(TypeIndex &Idx); bool remapItemIndex(TypeIndex &Idx); - bool remapIndices(RemappedType &Record, ArrayRef Refs) { - auto OriginalData = Record.OriginalRecord.content(); - bool Success = true; - for (auto &Ref : Refs) { - uint32_t Offset = Ref.Offset; - ArrayRef Bytes = - OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); - ArrayRef TIs(reinterpret_cast(Bytes.data()), - Ref.Count); - for (auto TI : TIs) { - TypeIndex NewTI = TI; - bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) - ? 
remapItemIndex(NewTI) - : remapTypeIndex(NewTI); - if (ThisSuccess && NewTI != TI) - Record.Mappings.emplace_back(Offset, NewTI); - Offset += sizeof(TypeIndex); - Success &= ThisSuccess; - } - } - return Success; - } + bool remapIndices(RemappedType &Record, ArrayRef Refs); bool remapIndex(TypeIndex &Idx, ArrayRef Map); @@ -128,21 +105,6 @@ private: return Error::success(); } - Error writeTypeRecord(const CVType &Record) { - TypeIndex DestIdx = - DestTypeStream->writeSerializedRecord(Record.RecordData); - addMapping(DestIdx); - return Error::success(); - } - - Error writeTypeRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestTypeStream, Record, RemapSuccess); - } - - Error writeIdRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestIdStream, Record, RemapSuccess); - } - Optional LastError; bool IsSecondPass = false; @@ -153,7 +115,6 @@ private: TypeTableBuilder *DestIdStream = nullptr; TypeTableBuilder *DestTypeStream = nullptr; - TypeServerHandler *Handler = nullptr; // If we're only mapping id records, this array contains the mapping for // type records. 
@@ -168,12 +129,8 @@ private: const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated); -Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { - RemappedType R(Rec); - SmallVector Refs; - discoverTypeIndices(Rec.RecordData, Refs); - bool Success = remapIndices(R, Refs); - switch (Rec.kind()) { +static bool isIdRecord(TypeLeafKind K) { + switch (K) { case TypeLeafKind::LF_FUNC_ID: case TypeLeafKind::LF_MFUNC_ID: case TypeLeafKind::LF_STRING_ID: @@ -181,19 +138,10 @@ Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { case TypeLeafKind::LF_BUILDINFO: case TypeLeafKind::LF_UDT_SRC_LINE: case TypeLeafKind::LF_UDT_MOD_SRC_LINE: - return writeIdRecord(R, Success); + return true; default: - return writeTypeRecord(R, Success); + return false; } - return Error::success(); -} - -Error TypeStreamMerger::visitTypeEnd(CVType &Rec) { - ++CurIndex; - if (!IsSecondPass) - assert(IndexMap.size() == slotForIndex(CurIndex) && - "visitKnownRecord should add one index map entry"); - return Error::success(); } void TypeStreamMerger::addMapping(TypeIndex Idx) { @@ -256,7 +204,7 @@ bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) { } Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, - const CVTypeArray &Types) { + const CVTypeArray &Types) { DestTypeStream = &Dest; return doit(Types); @@ -264,7 +212,7 @@ Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, - const CVTypeArray &Ids) { + const CVTypeArray &Ids) { DestIdStream = &Dest; TypeLookup = TypeSourceToDest; @@ -273,25 +221,14 @@ Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes) { + const CVTypeArray &IdsAndTypes) { DestIdStream = &DestIds; DestTypeStream = &DestTypes; - return doit(IdsAndTypes); } Error TypeStreamMerger::doit(const CVTypeArray 
&Types) { - LastError = Error::success(); - - // We don't want to deserialize records. I guess this flag is poorly named, - // but it really means "Don't deserialize records before switching on the - // concrete type. - // FIXME: We can probably get even more speed here if we don't use the visitor - // pipeline here, but instead write the switch ourselves. I don't think it - // would buy us much since it's already pretty fast, but it's probably worth - // a few cycles. - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; // If we found bad indices but no other errors, try doing another pass and see @@ -301,50 +238,92 @@ Error TypeStreamMerger::doit(const CVTypeArray &Types) { // topologically sorted. The standard library contains MASM-produced objects, // so this is important to handle correctly, but we don't have to be too // efficient. MASM type streams are usually very small. - while (!*LastError && NumBadIndices > 0) { + while (!LastError && NumBadIndices > 0) { unsigned BadIndicesRemaining = NumBadIndices; IsSecondPass = true; NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; assert(NumBadIndices <= BadIndicesRemaining && "second pass found more bad indices"); - if (!*LastError && NumBadIndices == BadIndicesRemaining) { + if (!LastError && NumBadIndices == BadIndicesRemaining) { return llvm::make_error( cv_error_code::corrupt_record, "input type graph contains cycles"); } } - Error Ret = std::move(*LastError); - LastError.reset(); - return Ret; + if (LastError) + return std::move(*LastError); + return Error::success(); +} + +Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) { + for (const CVType &Type : Types) + if (auto EC = remapType(Type)) + return EC; + return Error::success(); +} + +Error 
TypeStreamMerger::remapType(const CVType &Type) { + RemappedType R(Type); + SmallVector Refs; + discoverTypeIndices(Type.RecordData, Refs); + bool MappedAllIndices = remapIndices(R, Refs); + TypeTableBuilder &Dest = + isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream; + if (auto EC = writeRecord(Dest, R, MappedAllIndices)) + return EC; + + ++CurIndex; + assert((IsSecondPass || IndexMap.size() == slotForIndex(CurIndex)) && + "visitKnownRecord should add one index map entry"); + return Error::success(); +} + +bool TypeStreamMerger::remapIndices(RemappedType &Record, + ArrayRef Refs) { + ArrayRef OriginalData = Record.OriginalRecord.content(); + bool Success = true; + for (auto &Ref : Refs) { + uint32_t Offset = Ref.Offset; + ArrayRef Bytes = OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); + ArrayRef TIs(reinterpret_cast(Bytes.data()), + Ref.Count); + for (auto TI : TIs) { + TypeIndex NewTI = TI; + bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) + ? remapItemIndex(NewTI) + : remapTypeIndex(NewTI); + if (ThisSuccess && NewTI != TI) + Record.Mappings.emplace_back(Offset, NewTI); + Offset += sizeof(TypeIndex); + Success &= ThisSuccess; + } + } + return Success; } Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &Types) { - TypeStreamMerger M(SourceToDest, Handler); + const CVTypeArray &Types) { + TypeStreamMerger M(SourceToDest); return M.mergeTypeRecords(Dest, Types); } Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, SmallVectorImpl &SourceToDest, - const CVTypeArray &Ids) { - TypeStreamMerger M(SourceToDest, nullptr); + const CVTypeArray &Ids) { + TypeStreamMerger M(SourceToDest); return M.mergeIdRecords(Dest, TypeSourceToDest, Ids); } Error llvm::codeview::mergeTypeAndIdRecords( TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, - const CVTypeArray 
&IdsAndTypes) { - - TypeStreamMerger M(SourceToDest, Handler); + SmallVectorImpl &SourceToDest, const CVTypeArray &IdsAndTypes) { + TypeStreamMerger M(SourceToDest); return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes); } diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 0a10e6b78911..6cf44ffa3796 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -24,22 +24,166 @@ using namespace llvm; using namespace dwarf; using namespace object; -void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, + uint8_t &UnitType, bool &isUnitDWARF64) { + uint32_t AbbrOffset, Length; + uint8_t AddrSize = 0; + uint16_t Version; + bool Success = true; + + bool ValidLength = false; + bool ValidVersion = false; + bool ValidAddrSize = false; + bool ValidType = true; + bool ValidAbbrevOffset = true; + + uint32_t OffsetStart = *Offset; + Length = DebugInfoData.getU32(Offset); + if (Length == UINT32_MAX) { + isUnitDWARF64 = true; + OS << format( + "Unit[%d] is in 64-bit DWARF format; cannot verify from this point.\n", + UnitIndex); + return false; + } + Version = DebugInfoData.getU16(Offset); + + if (Version >= 5) { + UnitType = DebugInfoData.getU8(Offset); + AddrSize = DebugInfoData.getU8(Offset); + AbbrOffset = DebugInfoData.getU32(Offset); + ValidType = DWARFUnit::isValidUnitType(UnitType); + } else { + UnitType = 0; + AbbrOffset = DebugInfoData.getU32(Offset); + AddrSize = DebugInfoData.getU8(Offset); + } + + if (!DCtx.getDebugAbbrev()->getAbbreviationDeclarationSet(AbbrOffset)) + ValidAbbrevOffset = false; + + ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3); + ValidVersion = DWARFContext::isSupportedVersion(Version); + ValidAddrSize = AddrSize == 4 || AddrSize == 8; + if (!ValidLength || !ValidVersion || 
!ValidAddrSize || !ValidAbbrevOffset || + !ValidType) { + Success = false; + OS << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, OffsetStart); + if (!ValidLength) + OS << "\tError: The length for this unit is too " + "large for the .debug_info provided.\n"; + if (!ValidVersion) + OS << "\tError: The 16 bit unit header version is not valid.\n"; + if (!ValidType) + OS << "\tError: The unit type encoding is not valid.\n"; + if (!ValidAbbrevOffset) + OS << "\tError: The offset into the .debug_abbrev section is " + "not valid.\n"; + if (!ValidAddrSize) + OS << "\tError: The address size is unsupported.\n"; + } + *Offset = OffsetStart + Length + 4; + return Success; +} + +bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit) { + uint32_t NumUnitErrors = 0; + unsigned NumDies = Unit.getNumDIEs(); + for (unsigned I = 0; I < NumDies; ++I) { + auto Die = Unit.getDIEAtIndex(I); + if (Die.getTag() == DW_TAG_null) + continue; + for (auto AttrValue : Die.attributes()) { + NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue); + NumUnitErrors += verifyDebugInfoForm(Die, AttrValue); + } + } + return NumUnitErrors == 0; +} + +bool DWARFVerifier::handleDebugInfo() { + OS << "Verifying .debug_info Unit Header Chain...\n"; + + DWARFDataExtractor DebugInfoData(DCtx.getInfoSection(), DCtx.isLittleEndian(), + 0); + uint32_t NumDebugInfoErrors = 0; + uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint8_t UnitType = 0; + bool isUnitDWARF64 = false; + bool isHeaderChainValid = true; + bool hasDIE = DebugInfoData.isValidOffset(Offset); + while (hasDIE) { + OffsetStart = Offset; + if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, + isUnitDWARF64)) { + isHeaderChainValid = false; + if (isUnitDWARF64) + break; + } else { + std::unique_ptr Unit; + switch (UnitType) { + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: { + DWARFUnitSection TUSection{}; + Unit.reset(new DWARFTypeUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + 
&DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, TUSection, + nullptr)); + break; + } + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + // UnitType = 0 means that we are + // verifying a compile unit in DWARF v4. + case 0: { + DWARFUnitSection CUSection{}; + Unit.reset(new DWARFCompileUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + &DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, CUSection, + nullptr)); + break; + } + default: { llvm_unreachable("Invalid UnitType."); } + } + Unit->extract(DebugInfoData, &OffsetStart); + if (!verifyUnitContents(*Unit)) + ++NumDebugInfoErrors; + } + hasDIE = DebugInfoData.isValidOffset(Offset); + ++UnitIdx; + } + if (UnitIdx == 0 && !hasDIE) { + OS << "Warning: .debug_info is empty.\n"; + isHeaderChainValid = true; + } + NumDebugInfoErrors += verifyDebugInfoReferences(); + return (isHeaderChainValid && NumDebugInfoErrors == 0); +} + +unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Attr = AttrValue.Attr; switch (Attr) { case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. 
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getRangeSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_ranges offset is beyond .debug_ranges " "bounds:\n"; Die.dump(OS, 0); OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_ranges encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -49,7 +193,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, // Make sure the offset in the DW_AT_stmt_list attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getLineSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_stmt_list offset is beyond .debug_line " "bounds: " << format("0x%08" PRIx32, *SectionOffset) << "\n"; @@ -57,7 +201,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_stmt_list encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -67,10 +211,12 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Form = AttrValue.Value.getForm(); switch (Form) { case DW_FORM_ref1: @@ -86,7 +232,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset(); auto CUOffset = AttrValue.Value.getRawUValue(); if (CUOffset >= CUSize) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: " << FormEncodingString(Form) << " CU offset " << format("0x%08" PRIx32, CUOffset) << " is invalid (must be less than CU size of " @@ -108,7 +254,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, assert(RefVal); if 
(RefVal) { if (*RefVal >= DCtx.getInfoSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_ref_addr offset beyond .debug_info " "bounds:\n"; Die.dump(OS, 0); @@ -125,7 +271,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto SecOffset = AttrValue.Value.getAsSectionOffset(); assert(SecOffset); // DW_FORM_strp is a section offset. if (SecOffset && *SecOffset >= DCtx.getStringSection().size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_strp offset beyond .debug_str bounds:\n"; Die.dump(OS, 0); OS << "\n"; @@ -135,17 +281,19 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoReferences() { +unsigned DWARFVerifier::verifyDebugInfoReferences() { // Take all references and make sure they point to an actual DIE by // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; + unsigned NumErrors = 0; for (auto Pair : ReferenceToDIEOffsets) { auto Die = DCtx.getDIEForOffset(Pair.first); if (Die) continue; - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: invalid DIE reference " << format("0x%08" PRIx64, Pair.first) << ". 
Offset is in between DIEs:\n"; for (auto Offset : Pair.second) { @@ -155,26 +303,7 @@ void DWARFVerifier::verifyDebugInfoReferences() { } OS << "\n"; } -} - -bool DWARFVerifier::handleDebugInfo() { - NumDebugInfoErrors = 0; - OS << "Verifying .debug_info...\n"; - for (const auto &CU : DCtx.compile_units()) { - unsigned NumDies = CU->getNumDIEs(); - for (unsigned I = 0; I < NumDies; ++I) { - auto Die = CU->getDIEAtIndex(I); - const auto Tag = Die.getTag(); - if (Tag == DW_TAG_null) - continue; - for (auto AttrValue : Die.attributes()) { - verifyDebugInfoAttribute(Die, AttrValue); - verifyDebugInfoForm(Die, AttrValue); - } - } - } - verifyDebugInfoReferences(); - return NumDebugInfoErrors == 0; + return NumErrors; } void DWARFVerifier::verifyDebugLineStmtOffsets() { diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt index ff01c948e099..9b1f37943e67 100644 --- a/lib/DebugInfo/PDB/CMakeLists.txt +++ b/lib/DebugInfo/PDB/CMakeLists.txt @@ -52,7 +52,6 @@ add_pdb_impl_folder(Native Native/PDBFileBuilder.cpp Native/PDBStringTable.cpp Native/PDBStringTableBuilder.cpp - Native/PDBTypeServerHandler.cpp Native/PublicsStream.cpp Native/PublicsStreamBuilder.cpp Native/RawError.cpp diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp index 0b48a366bd24..4c59d2f2a9d9 100644 --- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp +++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp @@ -125,16 +125,16 @@ PrivateGetDIAValue(IDiaSymbol *Symbol, return Result8; } -PDB_UniqueId +codeview::GUID PrivateGetDIAValue(IDiaSymbol *Symbol, HRESULT (__stdcall IDiaSymbol::*Method)(GUID *)) { GUID Result; if (S_OK != (Symbol->*Method)(&Result)) - return PDB_UniqueId(); + return codeview::GUID(); - static_assert(sizeof(PDB_UniqueId) == sizeof(GUID), - "PDB_UniqueId is the wrong size!"); - PDB_UniqueId IdResult; + static_assert(sizeof(codeview::GUID) == sizeof(GUID), + "GUID is the wrong size!"); + codeview::GUID IdResult; ::memcpy(&IdResult, 
&Result, sizeof(GUID)); return IdResult; } @@ -746,7 +746,7 @@ PDB_SymType DIARawSymbol::getSymTag() const { &IDiaSymbol::get_symTag); } -PDB_UniqueId DIARawSymbol::getGuid() const { +codeview::GUID DIARawSymbol::getGuid() const { return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_guid); } diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 789f3b813170..4fcecb92fd15 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ b/lib/DebugInfo/PDB/GenericError.cpp @@ -26,6 +26,8 @@ public: switch (static_cast(Condition)) { case generic_error_code::unspecified: return "An unknown error has occurred."; + case generic_error_code::type_server_not_found: + return "Type server PDB was not found."; case generic_error_code::dia_sdk_not_present: return "LLVM was not compiled with support for DIA. This usually means " "that you are are not using MSVC, or your Visual Studio " diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index 21b66b3e7bcf..829879060c33 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -118,7 +118,7 @@ uint32_t InfoStream::getSignature() const { return Signature; } uint32_t InfoStream::getAge() const { return Age; } -PDB_UniqueId InfoStream::getGuid() const { return Guid; } +GUID InfoStream::getGuid() const { return Guid; } uint32_t InfoStream::getNamedStreamMapByteSize() const { return NamedStreamMapByteSize; diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp index 707128f7efd4..6450ae752f96 100644 --- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp @@ -34,7 +34,7 @@ void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; } void InfoStreamBuilder::setAge(uint32_t A) { Age = A; } -void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; } +void InfoStreamBuilder::setGuid(GUID G) { Guid = G; } void 
InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) { Features.push_back(Sig); diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp index cb0830f453c8..3241000b06db 100644 --- a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp @@ -56,12 +56,12 @@ std::string NativeExeSymbol::getSymbolsFileName() const { return File.getFilePath(); } -PDB_UniqueId NativeExeSymbol::getGuid() const { +codeview::GUID NativeExeSymbol::getGuid() const { auto IS = File.getPDBInfoStream(); if (IS) return IS->getGuid(); consumeError(IS.takeError()); - return PDB_UniqueId{{0}}; + return codeview::GUID{{0}}; } bool NativeExeSymbol::hasCTypes() const { diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index 92612bcea4ac..df3f418052a9 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -323,9 +323,7 @@ PDB_SymType NativeRawSymbol::getSymTag() const { return PDB_SymType::None; } -PDB_UniqueId NativeRawSymbol::getGuid() const { - return PDB_UniqueId{{0}}; -} +codeview::GUID NativeRawSymbol::getGuid() const { return codeview::GUID{{0}}; } int32_t NativeRawSymbol::getOffset() const { return 0; diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp deleted file mode 100644 index 9fd90102f72c..000000000000 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//===- PDBTypeServerHandler.cpp ---------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// Handles CodeView LF_TYPESERVER2 records by attempting to locate a matching -// PDB file, then loading the PDB file and visiting all types from the -// referenced PDB using the original supplied visitor. -// -// The net effect of this is that when visiting a PDB containing a TypeServer -// record, the TypeServer record is "replaced" with all of the records in -// the referenced PDB file. If a single instance of PDBTypeServerHandler -// encounters the same TypeServer multiple times (for example reusing one -// PDBTypeServerHandler across multiple visitations of distinct object files or -// PDB files), PDBTypeServerHandler will optionally revisit all the records -// again, or simply consume the record and do nothing. -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/PDB/GenericError.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/TpiStream.h" -#include "llvm/DebugInfo/PDB/PDB.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -static void ignoreErrors(Error EC) { - llvm::handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {}); -} - -PDBTypeServerHandler::PDBTypeServerHandler(bool RevisitAlways) - : RevisitAlways(RevisitAlways) {} - -void PDBTypeServerHandler::addSearchPath(StringRef Path) { - if (Path.empty() || !sys::fs::is_directory(Path)) - return; - - SearchPaths.insert(Path); -} - -Expected -PDBTypeServerHandler::handleInternal(PDBFile &File, - TypeVisitorCallbacks &Callbacks) { - auto ExpectedTpi = 
File.getPDBTpiStream(); - if (!ExpectedTpi) - return ExpectedTpi.takeError(); - - // For handling a type server, we should be using whatever the callback array - // was - // that is being used for the original file. We shouldn't allow the visitor - // to - // arbitrarily stick a deserializer in there. - if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks, - VDS_BytesExternal)) - return std::move(EC); - - return true; -} - -Expected PDBTypeServerHandler::handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - if (Session) { - // If we've already handled this TypeServer and we only want to handle each - // TypeServer once, consume the record without doing anything. - if (!RevisitAlways) - return true; - - return handleInternal(Session->getPDBFile(), Callbacks); - } - - StringRef File = sys::path::filename(TS.Name); - if (File.empty()) - return make_error( - cv_error_code::corrupt_record, - "TypeServer2Record does not contain filename!"); - - for (auto &Path : SearchPaths) { - SmallString<64> PathStr = Path.getKey(); - sys::path::append(PathStr, File); - if (!sys::fs::exists(PathStr)) - continue; - - std::unique_ptr ThisSession; - if (auto EC = loadDataForPDB(PDB_ReaderType::Native, PathStr, ThisSession)) { - // It is not an error if this PDB fails to load, it just means that it - // doesn't match and we should continue searching. - ignoreErrors(std::move(EC)); - continue; - } - - std::unique_ptr NS( - static_cast(ThisSession.release())); - PDBFile &File = NS->getPDBFile(); - auto ExpectedInfo = File.getPDBInfoStream(); - // All PDB Files should have an Info stream. - if (!ExpectedInfo) - return ExpectedInfo.takeError(); - - // Just because a file with a matching name was found and it was an actual - // PDB file doesn't mean it matches. For it to match the InfoStream's GUID - // must match the GUID specified in the TypeServer2 record. 
- ArrayRef GuidBytes(ExpectedInfo->getGuid().Guid); - StringRef GuidStr(reinterpret_cast(GuidBytes.begin()), - GuidBytes.size()); - if (GuidStr != TS.Guid) - continue; - - Session = std::move(NS); - return handleInternal(File, Callbacks); - } - - // We couldn't find a matching PDB, so let it be handled by someone else. - return false; -} diff --git a/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/lib/DebugInfo/PDB/Native/TpiHashing.cpp index 91b8d648fcf9..77a2d57a8369 100644 --- a/lib/DebugInfo/PDB/Native/TpiHashing.cpp +++ b/lib/DebugInfo/PDB/Native/TpiHashing.cpp @@ -11,101 +11,79 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/Support/JamCRC.h" using namespace llvm; using namespace llvm::codeview; using namespace llvm::pdb; // Corresponds to `fUDTAnon`. -template static bool isAnonymous(T &Rec) { - StringRef Name = Rec.getName(); +static bool isAnonymous(StringRef Name) { return Name == "" || Name == "__unnamed" || Name.endswith("::") || Name.endswith("::__unnamed"); } -// Computes a hash for a given TPI record. -template -static uint32_t getTpiHash(T &Rec, ArrayRef FullRecord) { - auto Opts = static_cast(Rec.getOptions()); - - bool ForwardRef = - Opts & static_cast(ClassOptions::ForwardReference); - bool Scoped = Opts & static_cast(ClassOptions::Scoped); - bool UniqueName = Opts & static_cast(ClassOptions::HasUniqueName); - bool IsAnon = UniqueName && isAnonymous(Rec); +// Computes the hash for a user-defined type record. This could be a struct, +// class, union, or enum. 
+static uint32_t getHashForUdt(const TagRecord &Rec, + ArrayRef FullRecord) { + ClassOptions Opts = Rec.getOptions(); + bool ForwardRef = bool(Opts & ClassOptions::ForwardReference); + bool Scoped = bool(Opts & ClassOptions::Scoped); + bool HasUniqueName = bool(Opts & ClassOptions::HasUniqueName); + bool IsAnon = HasUniqueName && isAnonymous(Rec.getName()); if (!ForwardRef && !Scoped && !IsAnon) return hashStringV1(Rec.getName()); - if (!ForwardRef && UniqueName && !IsAnon) + if (!ForwardRef && HasUniqueName && !IsAnon) return hashStringV1(Rec.getUniqueName()); return hashBufferV8(FullRecord); } -template static uint32_t getSourceLineHash(T &Rec) { - char Buf[4]; - support::endian::write32le(Buf, Rec.getUDT().getIndex()); - return hashStringV1(StringRef(Buf, 4)); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtModSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, ClassRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, EnumRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, UnionRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UdtSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, - UdtModSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, ClassRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, EnumRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != 
HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UnionRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); +template +static Expected getHashForUdt(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast(Rec), + Deserialized)) + return std::move(E); + return getHashForUdt(Deserialized, Rec.data()); } -Error TpiHashVerifier::verifySourceLine(codeview::TypeIndex TI) { +template +static Expected getSourceLineHash(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast(Rec), + Deserialized)) + return std::move(E); char Buf[4]; - support::endian::write32le(Buf, TI.getIndex()); - uint32_t Hash = hashStringV1(StringRef(Buf, 4)); - if (Hash % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); + support::endian::write32le(Buf, Deserialized.getUDT().getIndex()); + return hashStringV1(StringRef(Buf, 4)); } -Error TpiHashVerifier::visitTypeBegin(CVType &Rec) { - ++Index; - RawRecord = Rec; - return Error::success(); +Expected llvm::pdb::hashTypeRecord(const CVType &Rec) { + switch (Rec.kind()) { + case LF_CLASS: + case LF_STRUCTURE: + case LF_INTERFACE: + return getHashForUdt(Rec); + case LF_UNION: + return getHashForUdt(Rec); + case LF_ENUM: + return getHashForUdt(Rec); + + case LF_UDT_SRC_LINE: + return getSourceLineHash(Rec); + case LF_UDT_MOD_SRC_LINE: + return getSourceLineHash(Rec); + + default: + break; + } + + // Run CRC32 over the bytes. This corresponds to `hashBufv8`. 
+ JamCRC JC(/*Init=*/0U); + ArrayRef Bytes(reinterpret_cast(Rec.data().data()), + Rec.data().size()); + JC.update(Bytes); + return JC.getCRC(); } diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index f917ef91f639..d3ef87d9009d 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp index faf1142ddf17..c291185bc67a 100644 --- a/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/lib/DebugInfo/PDB/PDBExtras.cpp @@ -260,12 +260,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, return OS; } -raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Guid) { - codeview::detail::GuidAdapter A(Guid.Guid); - A.format(OS, ""); - return OS; -} - raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) { switch (Type) { CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS) diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index adca0eeb08b4..43461de4c491 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -288,7 +288,6 @@ private: HalfDiffKindBits); addRelocationForSection(R, SectionAID); - addRelocationForSection(R, SectionBID); return ++RelI; } diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt index fa743c280e86..bc744890b997 100644 --- a/lib/Fuzzer/CMakeLists.txt +++ b/lib/Fuzzer/CMakeLists.txt @@ -46,7 +46,6 @@ if ( 
LLVM_USE_SANITIZE_COVERAGE OR CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux" ) FuzzerShmemPosix.cpp FuzzerShmemWindows.cpp FuzzerTracePC.cpp - FuzzerTraceState.cpp FuzzerUtil.cpp FuzzerUtilDarwin.cpp FuzzerUtilLinux.cpp diff --git a/lib/Fuzzer/FuzzerCorpus.h b/lib/Fuzzer/FuzzerCorpus.h index 218ae5b6ac4d..bae0aea78f13 100644 --- a/lib/Fuzzer/FuzzerCorpus.h +++ b/lib/Fuzzer/FuzzerCorpus.h @@ -34,7 +34,8 @@ struct InputInfo { size_t NumExecutedMutations = 0; size_t NumSuccessfullMutations = 0; bool MayDeleteFile = false; - std::vector FeatureSet; + bool Reduced = false; + std::vector UniqFeatureSet; }; class InputCorpus { @@ -79,7 +80,8 @@ class InputCorpus { II.U = U; II.NumFeatures = NumFeatures; II.MayDeleteFile = MayDeleteFile; - II.FeatureSet = FeatureSet; + II.UniqFeatureSet = FeatureSet; + std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); ComputeSHA1(U.data(), U.size(), II.Sha1); Hashes.insert(Sha1ToString(II.Sha1)); UpdateCorpusDistribution(); @@ -117,34 +119,21 @@ class InputCorpus { Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); PrintUnit(II->U); Printf(" "); - PrintFeatureSet(II->FeatureSet); + PrintFeatureSet(II->UniqFeatureSet); Printf("\n"); } i++; } } - // If FeatureSet is that same as in II, replace II->U with {Data,Size}. 
- bool TryToReplace(InputInfo *II, const uint8_t *Data, size_t Size, - const std::vector &FeatureSet) { - if (II->U.size() > Size && II->FeatureSet.size() && - II->FeatureSet == FeatureSet) { - if (FeatureDebug) - Printf("Replace: %zd => %zd\n", II->U.size(), Size); - Replace(II, {Data, Data + Size}); - PrintCorpus(); - return true; - } - return false; - } - void Replace(InputInfo *II, const Unit &U) { - assert(II->U.size()); + assert(II->U.size() > U.size()); Hashes.erase(Sha1ToString(II->Sha1)); DeleteFile(*II); ComputeSHA1(U.data(), U.size(), II->Sha1); Hashes.insert(Sha1ToString(II->Sha1)); II->U = U; + II->Reduced = true; } bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } @@ -198,7 +187,7 @@ class InputCorpus { Printf("EVICTED %zd\n", Idx); } - void AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { + bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { assert(NewSize); Idx = Idx % kFeatureSetSize; uint32_t OldSize = GetFeature(Idx); @@ -218,8 +207,9 @@ class InputCorpus { Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); SmallestElementPerFeature[Idx] = Inputs.size(); InputSizesPerFeature[Idx] = NewSize; - CountingFeatures = true; + return true; } + return false; } size_t NumFeatures() const { return NumAddedFeatures; } @@ -238,7 +228,6 @@ private: size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } void ValidateFeatureSet() { - if (!CountingFeatures) return; if (FeatureDebug) PrintFeatureSet(); for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) @@ -256,14 +245,12 @@ private: // Must be called whenever the corpus or unit weights are changed. 
void UpdateCorpusDistribution() { size_t N = Inputs.size(); + assert(N); Intervals.resize(N + 1); Weights.resize(N); std::iota(Intervals.begin(), Intervals.end(), 0); - if (CountingFeatures) - for (size_t i = 0; i < N; i++) - Weights[i] = Inputs[i]->NumFeatures * (i + 1); - else - std::iota(Weights.begin(), Weights.end(), 1); + for (size_t i = 0; i < N; i++) + Weights[i] = Inputs[i]->NumFeatures * (i + 1); CorpusDistribution = std::piecewise_constant_distribution( Intervals.begin(), Intervals.end(), Weights.begin()); } @@ -275,7 +262,6 @@ private: std::unordered_set Hashes; std::vector Inputs; - bool CountingFeatures = false; size_t NumAddedFeatures = 0; size_t NumUpdatedFeatures = 0; uint32_t InputSizesPerFeature[kFeatureSetSize]; diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index 87968893853e..fd8cab38a7bb 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -186,7 +186,11 @@ static void ParseFlags(const std::vector &Args) { } Inputs = new std::vector; for (size_t A = 1; A < Args.size(); A++) { - if (ParseOneFlag(Args[A].c_str())) continue; + if (ParseOneFlag(Args[A].c_str())) { + if (Flags.ignore_remaining_args) + break; + continue; + } Inputs->push_back(Args[A]); } } @@ -356,16 +360,17 @@ int MinimizeCrashInput(const std::vector &Args, exit(1); } std::string InputFilePath = Inputs->at(0); - std::string BaseCmd = - CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path"); - auto InputPos = BaseCmd.find(" " + InputFilePath + " "); + auto BaseCmd = SplitBefore( + "-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path")); + auto InputPos = BaseCmd.first.find(" " + InputFilePath + " "); assert(InputPos != std::string::npos); - BaseCmd.erase(InputPos, InputFilePath.size() + 1); + BaseCmd.first.erase(InputPos, InputFilePath.size() + 1); if (Flags.runs <= 0 && Flags.max_total_time == 0) { Printf("INFO: you need to specify -runs=N or " "-max_total_time=N with 
-minimize_crash=1\n" "INFO: defaulting to -max_total_time=600\n"); - BaseCmd += " -max_total_time=600"; + BaseCmd.first += " -max_total_time=600"; } auto LogFilePath = DirPlusFile( @@ -378,7 +383,8 @@ int MinimizeCrashInput(const std::vector &Args, Printf("CRASH_MIN: minimizing crash input: '%s' (%zd bytes)\n", CurrentFilePath.c_str(), U.size()); - auto Cmd = BaseCmd + " " + CurrentFilePath + LogFileRedirect; + auto Cmd = BaseCmd.first + " " + CurrentFilePath + LogFileRedirect + " " + + BaseCmd.second; Printf("CRASH_MIN: executing: %s\n", Cmd.c_str()); int ExitCode = ExecuteCommand(Cmd); diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 5e70cbad3cf1..526805705b20 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -121,6 +121,9 @@ FUZZER_FLAG_STRING(exit_on_src_pos, "Exit if a newly found PC originates" FUZZER_FLAG_STRING(exit_on_item, "Exit if an item with a given sha1 sum" " was added to the corpus. " "Used primarily for testing libFuzzer itself.") +FUZZER_FLAG_INT(ignore_remaining_args, 0, "If 1, ignore all arguments passed " + "after this one. 
Useful for fuzzers that need to do their own " + "argument parsing.") FUZZER_FLAG_STRING(run_equivalence_server, "Experimental") FUZZER_FLAG_STRING(use_equivalence_server, "Experimental") diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index a732f895375e..3fc3fe004cef 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -38,7 +38,6 @@ public: void Loop(); void MinimizeCrashLoop(const Unit &U); void ShuffleAndMinimize(UnitVector *V); - void InitializeTraceState(); void RereadOutputCorpus(size_t MaxSize); size_t secondsSinceProcessStartUp() { @@ -100,19 +99,10 @@ private: void WriteToOutputCorpus(const Unit &U); void WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix); void PrintStats(const char *Where, const char *End = "\n", size_t Units = 0); - void PrintStatusForNewUnit(const Unit &U); + void PrintStatusForNewUnit(const Unit &U, const char *Text); void ShuffleCorpus(UnitVector *V); void CheckExitOnSrcPosOrItem(); - // Trace-based fuzzing: we run a unit with some kind of tracing - // enabled and record potentially useful mutations. Then - // We apply these mutations one by one to the unit and run it again. - - // Start tracing; forget all previously proposed mutations. - void StartTraceRecording(); - // Stop tracing. - void StopTraceRecording(); - static void StaticDeathCallback(); void DumpCurrentUnit(const char *Prefix); void DeathCallback(); @@ -142,7 +132,7 @@ private: size_t MaxInputLen = 0; size_t MaxMutationLen = 0; - std::vector FeatureSetTmp; + std::vector UniqFeatureSetTmp; // Need to know our own thread. 
static thread_local bool IsMyThread; diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 6816f3af8a6f..8ac7a847aef7 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -114,7 +114,6 @@ Fuzzer::Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD, : CB(CB), Corpus(Corpus), MD(MD), Options(Options) { if (EF->__sanitizer_set_death_callback) EF->__sanitizer_set_death_callback(StaticDeathCallback); - InitializeTraceState(); assert(!F); F = this; TPC.ResetMaps(); @@ -403,22 +402,29 @@ bool Fuzzer::RunOne(const uint8_t *Data, size_t Size, bool MayDeleteFile, ExecuteCallback(Data, Size); - FeatureSetTmp.clear(); + UniqFeatureSetTmp.clear(); + size_t FoundUniqFeaturesOfII = 0; size_t NumUpdatesBefore = Corpus.NumFeatureUpdates(); TPC.CollectFeatures([&](size_t Feature) { - Corpus.AddFeature(Feature, Size, Options.Shrink); - if (Options.ReduceInputs) - FeatureSetTmp.push_back(Feature); + if (Corpus.AddFeature(Feature, Size, Options.Shrink)) + UniqFeatureSetTmp.push_back(Feature); + if (Options.ReduceInputs && II) + if (std::binary_search(II->UniqFeatureSet.begin(), + II->UniqFeatureSet.end(), Feature)) + FoundUniqFeaturesOfII++; }); PrintPulseAndReportSlowInput(Data, Size); size_t NumNewFeatures = Corpus.NumFeatureUpdates() - NumUpdatesBefore; if (NumNewFeatures) { Corpus.AddToCorpus({Data, Data + Size}, NumNewFeatures, MayDeleteFile, - FeatureSetTmp); + UniqFeatureSetTmp); CheckExitOnSrcPosOrItem(); return true; } - if (II && Corpus.TryToReplace(II, Data, Size, FeatureSetTmp)) { + if (II && FoundUniqFeaturesOfII && + FoundUniqFeaturesOfII == II->UniqFeatureSet.size() && + II->U.size() > Size) { + Corpus.Replace(II, {Data, Data + Size}); CheckExitOnSrcPosOrItem(); return true; } @@ -501,10 +507,10 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) { Printf("Base64: %s\n", Base64(U).c_str()); } -void Fuzzer::PrintStatusForNewUnit(const Unit &U) { +void Fuzzer::PrintStatusForNewUnit(const Unit 
&U, const char *Text) { if (!Options.PrintNEW) return; - PrintStats("NEW ", ""); + PrintStats(Text, ""); if (Options.Verbosity) { Printf(" L: %zd ", U.size()); MD.PrintMutationSequence(); @@ -515,7 +521,8 @@ void Fuzzer::PrintStatusForNewUnit(const Unit &U) { void Fuzzer::ReportNewCoverage(InputInfo *II, const Unit &U) { II->NumSuccessfullMutations++; MD.RecordSuccessfulMutationSequence(); - PrintStatusForNewUnit(U); + PrintStatusForNewUnit(U, II->Reduced ? "REDUCE" : + "NEW "); WriteToOutputCorpus(U); NumberOfNewUnitsAdded++; TPC.PrintNewPCs(); @@ -600,13 +607,10 @@ void Fuzzer::MutateAndTestOne() { assert(NewSize > 0 && "Mutator returned empty unit"); assert(NewSize <= CurrentMaxMutationLen && "Mutator return overisized unit"); Size = NewSize; - if (i == 0) - StartTraceRecording(); II.NumExecutedMutations++; if (RunOne(CurrentUnitData, Size, /*MayDeleteFile=*/true, &II)) ReportNewCoverage(&II, {CurrentUnitData, CurrentUnitData + Size}); - StopTraceRecording(); TryDetectingAMemoryLeak(CurrentUnitData, Size, /*DuringInitialCorpusExecution*/ false); } diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 612f4bbb28f2..616c0999aa39 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -241,7 +241,6 @@ void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) { return true; }); // Show stats. - TotalNumberOfRuns++; if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1))) PrintStats("pulse "); // Write the post-run marker and the coverage. @@ -286,12 +285,13 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, // Execute the inner process untill it passes. // Every inner process should execute at least one input. 
- std::string BaseCmd = CloneArgsWithoutX(Args, "keep-all-flags"); + auto BaseCmd = SplitBefore("-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "keep-all-flags")); bool Success = false; for (size_t i = 1; i <= AllFiles.size(); i++) { Printf("MERGE-OUTER: attempt %zd\n", i); - auto ExitCode = - ExecuteCommand(BaseCmd + " -merge_control_file=" + CFPath); + auto ExitCode = ExecuteCommand(BaseCmd.first + " -merge_control_file=" + + CFPath + " " + BaseCmd.second); if (!ExitCode) { Printf("MERGE-OUTER: succesfull in %zd attempt(s)\n", i); Success = true; diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp index 53cb9027e455..5998ef9d3193 100644 --- a/lib/Fuzzer/FuzzerMutate.cpp +++ b/lib/Fuzzer/FuzzerMutate.cpp @@ -43,8 +43,6 @@ MutationDispatcher::MutationDispatcher(Random &Rand, {&MutationDispatcher::Mutate_CrossOver, "CrossOver"}, {&MutationDispatcher::Mutate_AddWordFromManualDictionary, "ManualDict"}, - {&MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, - "TempAutoDict"}, {&MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary, "PersAutoDict"}, }); @@ -165,11 +163,6 @@ size_t MutationDispatcher::Mutate_AddWordFromManualDictionary(uint8_t *Data, return AddWordFromDictionary(ManualDictionary, Data, Size, MaxSize); } -size_t MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary( - uint8_t *Data, size_t Size, size_t MaxSize) { - return AddWordFromDictionary(TempAutoDictionary, Data, Size, MaxSize); -} - size_t MutationDispatcher::ApplyDictionaryEntry(uint8_t *Data, size_t Size, size_t MaxSize, DictionaryEntry &DE) { @@ -251,7 +244,7 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( uint8_t *Data, size_t Size, size_t MaxSize) { Word W; DictionaryEntry DE; - switch (Rand(3)) { + switch (Rand(4)) { case 0: { auto X = TPC.TORC8.Get(Rand.Rand()); DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); @@ -267,6 +260,10 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( auto X = TPC.TORCW.Get(Rand.Rand()); DE = 
MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); } break; + case 3: if (Options.UseMemmem) { + auto X = TPC.MMT.Get(Rand.Rand()); + DE = DictionaryEntry(X); + } break; default: assert(0); } @@ -533,14 +530,4 @@ void MutationDispatcher::AddWordToManualDictionary(const Word &W) { {W, std::numeric_limits::max()}); } -void MutationDispatcher::AddWordToAutoDictionary(DictionaryEntry DE) { - static const size_t kMaxAutoDictSize = 1 << 14; - if (TempAutoDictionary.size() >= kMaxAutoDictSize) return; - TempAutoDictionary.push_back(DE); -} - -void MutationDispatcher::ClearAutoDictionary() { - TempAutoDictionary.clear(); -} - } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerMutate.h b/lib/Fuzzer/FuzzerMutate.h index 8c8fb3fd74c7..84b04c0dbf3e 100644 --- a/lib/Fuzzer/FuzzerMutate.h +++ b/lib/Fuzzer/FuzzerMutate.h @@ -52,10 +52,6 @@ public: size_t Mutate_AddWordFromManualDictionary(uint8_t *Data, size_t Size, size_t MaxSize); - /// Mutates data by adding a word from the temporary automatic dictionary. - size_t Mutate_AddWordFromTemporaryAutoDictionary(uint8_t *Data, size_t Size, - size_t MaxSize); - /// Mutates data by adding a word from the TORC. 
size_t Mutate_AddWordFromTORC(uint8_t *Data, size_t Size, size_t MaxSize); @@ -84,8 +80,6 @@ public: void AddWordToManualDictionary(const Word &W); - void AddWordToAutoDictionary(DictionaryEntry DE); - void ClearAutoDictionary(); void PrintRecommendedDictionary(); void SetCorpus(const InputCorpus *Corpus) { this->Corpus = Corpus; } diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp index 6f5c7be41062..ced0a2133340 100644 --- a/lib/Fuzzer/FuzzerTracePC.cpp +++ b/lib/Fuzzer/FuzzerTracePC.cpp @@ -37,6 +37,8 @@ namespace fuzzer { TracePC TPC; +int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; + uint8_t *TracePC::Counters() const { return __sancov_trace_pc_guard_8bit_counters; } @@ -293,6 +295,20 @@ void TracePC::HandleCmp(uintptr_t PC, T Arg1, T Arg2) { ValueProfileMap.AddValue(Idx); } +static size_t InternalStrnlen(const char *S, size_t MaxLen) { + size_t Len = 0; + for (; Len < MaxLen && S[Len]; Len++) {} + return Len; +} + +// Finds min of (strlen(S1), strlen(S2)). +// Needed bacause one of these strings may actually be non-zero terminated. +static size_t InternalStrnlen2(const char *S1, const char *S2) { + size_t Len = 0; + for (; S1[Len] && S2[Len]; Len++) {} + return Len; +} + } // namespace fuzzer extern "C" { @@ -415,4 +431,71 @@ void __sanitizer_cov_trace_gep(uintptr_t Idx) { uintptr_t PC = reinterpret_cast(__builtin_return_address(0)); fuzzer::TPC.HandleCmp(PC, Idx, (uintptr_t)0); } + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, + const void *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + if (n <= 1) return; // Not interesting. 
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t Len1 = fuzzer::InternalStrnlen(s1, n); + size_t Len2 = fuzzer::InternalStrnlen(s2, n); + n = std::min(n, Len1); + n = std::min(n, Len2); + if (n <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t N = fuzzer::InternalStrnlen2(s1, s2); + if (N <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY 
+void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1, + const void *s2, size_t len2, void *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), len2); +} } // extern "C" diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h index 5ec8c590b4df..b36c4f54306c 100644 --- a/lib/Fuzzer/FuzzerTracePC.h +++ b/lib/Fuzzer/FuzzerTracePC.h @@ -45,6 +45,28 @@ struct TableOfRecentCompares { Pair Table[kSize]; }; +template +struct MemMemTable { + static const size_t kSize = kSizeT; + Word MemMemWords[kSize]; + Word EmptyWord; + + void Add(const uint8_t *Data, size_t Size) { + if (Size <= 2) return; + Size = std::min(Size, Word::GetMaxSize()); + size_t Idx = SimpleFastHash(Data, Size) % kSize; + MemMemWords[Idx].Set(Data, Size); + } + const Word &Get(size_t Idx) { + for (size_t i = 0; i < kSize; i++) { + const Word &W = MemMemWords[(Idx + i) % kSize]; + if (W.size()) return W; + } + EmptyWord.Set(nullptr, 0); + return EmptyWord; + } +}; + class TracePC { public: static const size_t kNumPCs = 1 << 21; @@ -81,6 +103,7 @@ class TracePC { TableOfRecentCompares TORC4; TableOfRecentCompares TORC8; TableOfRecentCompares TORCW; + MemMemTable<1024> MMT; void PrintNewPCs(); void InitializePrintNewPCs(); diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp deleted file mode 100644 index 8670e2ad6727..000000000000 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===- FuzzerTraceState.cpp - Trace-based fuzzer mutator ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open 
Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Data tracing. -//===----------------------------------------------------------------------===// - -#include "FuzzerDictionary.h" -#include "FuzzerIO.h" -#include "FuzzerInternal.h" -#include "FuzzerMutate.h" -#include "FuzzerTracePC.h" -#include -#include -#include -#include -#include - -namespace fuzzer { - -// Declared as static globals for faster checks inside the hooks. -static bool RecordingMemmem = false; - -int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; - -class TraceState { -public: - TraceState(MutationDispatcher &MD, const FuzzingOptions &Options, - const Fuzzer *F) - : MD(MD), Options(Options), F(F) {} - - void StartTraceRecording() { - if (!Options.UseMemmem) - return; - RecordingMemmem = true; - InterestingWords.clear(); - MD.ClearAutoDictionary(); - } - - void StopTraceRecording() { - if (!RecordingMemmem) - return; - for (auto &W : InterestingWords) - MD.AddWordToAutoDictionary({W}); - } - - void AddInterestingWord(const uint8_t *Data, size_t Size) { - if (!RecordingMemmem || !F->InFuzzingThread()) return; - if (Size <= 1) return; - Size = std::min(Size, Word::GetMaxSize()); - Word W(Data, Size); - InterestingWords.insert(W); - } - - private: - - // TODO: std::set is too inefficient, need to have a custom DS here. 
- std::set InterestingWords; - MutationDispatcher &MD; - const FuzzingOptions Options; - const Fuzzer *F; -}; - -static TraceState *TS; - -void Fuzzer::StartTraceRecording() { - if (!TS) return; - TS->StartTraceRecording(); -} - -void Fuzzer::StopTraceRecording() { - if (!TS) return; - TS->StopTraceRecording(); -} - -void Fuzzer::InitializeTraceState() { - if (!Options.UseMemmem) return; - TS = new TraceState(MD, Options, this); -} - -static size_t InternalStrnlen(const char *S, size_t MaxLen) { - size_t Len = 0; - for (; Len < MaxLen && S[Len]; Len++) {} - return Len; -} - -// Finds min of (strlen(S1), strlen(S2)). -// Needed bacause one of these strings may actually be non-zero terminated. -static size_t InternalStrnlen2(const char *S1, const char *S2) { - size_t Len = 0; - for (; S1[Len] && S2[Len]; Len++) {} - return Len; -} - -} // namespace fuzzer - -using fuzzer::TS; - -extern "C" { - -// We may need to avoid defining weak hooks to stay compatible with older clang. -#ifndef LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -# define LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS 1 -#endif - -#if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, - const void *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. 
- size_t Len1 = fuzzer::InternalStrnlen(s1, n); - size_t Len2 = fuzzer::InternalStrnlen(s2, n); - n = std::min(n, Len1); - n = std::min(n, Len2); - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); -} - - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - size_t N = fuzzer::InternalStrnlen2(s1, s2); - if (N <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memmem(void *called_pc, const void 
*s1, size_t len1, - const void *s2, size_t len2, void *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), len2); -} - -#endif // LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -} // extern "C" diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp index f5fd3a85187c..2d95f40e46a1 100644 --- a/lib/Fuzzer/FuzzerUtil.cpp +++ b/lib/Fuzzer/FuzzerUtil.cpp @@ -215,4 +215,11 @@ bool ExecuteCommandAndReadOutput(const std::string &Command, std::string *Out) { return true; } +size_t SimpleFastHash(const uint8_t *Data, size_t Size) { + size_t Res = 0; + for (size_t i = 0; i < Size; i++) + Res = Res * 11 + Data[i]; + return Res; +} + } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerUtil.h b/lib/Fuzzer/FuzzerUtil.h index f84fd9ef0fce..62d6e61dcf17 100644 --- a/lib/Fuzzer/FuzzerUtil.h +++ b/lib/Fuzzer/FuzzerUtil.h @@ -67,10 +67,20 @@ inline std::string CloneArgsWithoutX(const std::vector &Args, return CloneArgsWithoutX(Args, X, X); } +inline std::pair SplitBefore(std::string X, + std::string S) { + auto Pos = S.find(X); + if (Pos == std::string::npos) + return std::make_pair(S, ""); + return std::make_pair(S.substr(0, Pos), S.substr(Pos)); +} + std::string DisassembleCmd(const std::string &FileName); std::string SearchRegexCmd(const std::string &Regex); +size_t SimpleFastHash(const uint8_t *Data, size_t Size); + } // namespace fuzzer #endif // LLVM_FUZZER_UTIL_H diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp index d0521bdfdd67..15bceb896e17 100644 --- a/lib/Fuzzer/afl/afl_driver.cpp +++ b/lib/Fuzzer/afl/afl_driver.cpp @@ -22,8 +22,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { return 0; } EOF -# Build your target with -fsanitize-coverage=trace-pc using fresh clang. -clang -g -fsanitize-coverage=trace-pc test_fuzzer.cc -c +# Build your target with -fsanitize-coverage=trace-pc-guard using fresh clang. 
+clang -g -fsanitize-coverage=trace-pc-guard test_fuzzer.cc -c # Build afl-llvm-rt.o.c from the AFL distribution. clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c # Build this file, link it with afl-llvm-rt.o.o and the target code. diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index 30566bdc87ae..43aea2b7a186 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -90,6 +90,7 @@ set(Tests EmptyTest EquivalenceATest EquivalenceBTest + FlagsTest FourIndependentBranchesTest FullCoverageSetTest InitializeTest @@ -272,5 +273,5 @@ add_lit_testsuite(check-fuzzer "Running Fuzzer tests" # Don't add dependencies on Windows. The linker step would fail on Windows, # since cmake will use link.exe for linking and won't include compiler-rt libs. if(NOT MSVC) - add_dependencies(check-fuzzer FileCheck sancov not llvm-symbolizer) + add_dependencies(check-fuzzer FileCheck sancov not) endif() diff --git a/lib/Fuzzer/test/FlagsTest.cpp b/lib/Fuzzer/test/FlagsTest.cpp new file mode 100644 index 000000000000..ac64b9d48df5 --- /dev/null +++ b/lib/Fuzzer/test/FlagsTest.cpp @@ -0,0 +1,32 @@ +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. + +// Parse some flags +#include +#include + +static std::vector Flags; + +extern "C" int LLVMFuzzerInitialize(int *Argc, char ***Argv) { + // Parse --flags and anything after -ignore_remaining_args=1 is passed. 
+ int I = 1; + while (I < *Argc) { + std::string S((*Argv)[I++]); + if (S == "-ignore_remaining_args=1") + break; + if (S.substr(0, 2) == "--") + Flags.push_back(S); + } + while (I < *Argc) + Flags.push_back(std::string((*Argv)[I++])); + + return 0; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + fprintf(stderr, "BINGO "); + for (auto Flag : Flags) + fprintf(stderr, "%s ", Flag.c_str()); + fprintf(stderr, "\n"); + exit(0); +} diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp index 1053c28527bf..eba2663029b2 100644 --- a/lib/Fuzzer/test/FuzzerUnittest.cpp +++ b/lib/Fuzzer/test/FuzzerUnittest.cpp @@ -424,35 +424,6 @@ TEST(FuzzerMutate, AddWordFromDictionary2) { TestAddWordFromDictionary(&MutationDispatcher::Mutate, 1 << 15); } -void TestAddWordFromDictionaryWithHint(Mutator M, int NumIter) { - std::unique_ptr t(new ExternalFunctions()); - fuzzer::EF = t.get(); - Random Rand(0); - MutationDispatcher MD(Rand, {}); - uint8_t W[] = {0xAA, 0xBB, 0xCC, 0xDD, 0xFF, 0xEE, 0xEF}; - size_t PosHint = 7777; - MD.AddWordToAutoDictionary({Word(W, sizeof(W)), PosHint}); - int FoundMask = 0; - for (int i = 0; i < NumIter; i++) { - uint8_t T[10000]; - memset(T, 0, sizeof(T)); - size_t NewSize = (MD.*M)(T, 9000, 10000); - if (NewSize >= PosHint + sizeof(W) && - !memcmp(W, T + PosHint, sizeof(W))) - FoundMask = 1; - } - EXPECT_EQ(FoundMask, 1); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint1) { - TestAddWordFromDictionaryWithHint( - &MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, 1 << 5); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint2) { - TestAddWordFromDictionaryWithHint(&MutationDispatcher::Mutate, 1 << 10); -} - void TestChangeASCIIInteger(Mutator M, int NumIter) { std::unique_ptr t(new ExternalFunctions()); fuzzer::EF = t.get(); @@ -593,7 +564,7 @@ TEST(Corpus, Distribution) { size_t N = 10; size_t TriesPerUnit = 1<<16; for (size_t i = 0; i < N; i++) - C->AddToCorpus(Unit{ 
static_cast(i) }, 0, false, {}); + C->AddToCorpus(Unit{ static_cast(i) }, 1, false, {}); std::vector Hist(N); for (size_t i = 0; i < N * TriesPerUnit; i++) { diff --git a/lib/Fuzzer/test/fuzzer-flags.test b/lib/Fuzzer/test/fuzzer-flags.test index 76ea27705750..976da2906d7c 100644 --- a/lib/Fuzzer/test/fuzzer-flags.test +++ b/lib/Fuzzer/test/fuzzer-flags.test @@ -1,10 +1,21 @@ -RUN: LLVMFuzzer-SimpleTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR +# Does not work on windows for unknown reason. +UNSUPPORTED: windows + +RUN: LLVMFuzzer-FlagsTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR FOO_BAR: WARNING: unrecognized flag '-foo_bar=1'; use -help=1 to list all flags FOO_BAR: BINGO -RUN: LLVMFuzzer-SimpleTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH +RUN: LLVMFuzzer-FlagsTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH DASH_DASH: WARNING: did you mean '-max_len=100' (single dash)? DASH_DASH: INFO: A corpus is not provided, starting from an empty corpus -RUN: LLVMFuzzer-SimpleTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL +RUN: LLVMFuzzer-FlagsTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL NO_INTERNAL-NOT: internal flag + +RUN: LLVMFuzzer-FlagsTest --foo-bar -runs=10 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU +PASSTHRU: BINGO --foo-bar --baz -help=1 test + +RUN: mkdir -p %t/T0 %t/T1 +RUN: touch %t/T1/empty +RUN: LLVMFuzzer-FlagsTest --foo-bar -merge=1 %t/T0 %t/T1 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU-MERGE +PASSTHRU-MERGE: BINGO --foo-bar --baz -help=1 test diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test index f93a8b7199e2..77ca4b47bd01 100644 --- a/lib/Fuzzer/test/fuzzer-traces-hooks.test +++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test @@ -10,7 +10,7 @@ RUN: not LLVMFuzzer-StrstrTest -seed=1 -runs=2000000 2>&1 | File RUN: not 
LLVMFuzzer-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s -RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT +RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 -max_len=20 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT RECOMMENDED_DICT:###### Recommended dictionary. ###### RECOMMENDED_DICT-DAG: "foo" RECOMMENDED_DICT-DAG: "bar" diff --git a/lib/Fuzzer/test/reduce_inputs.test b/lib/Fuzzer/test/reduce_inputs.test index a4a5c57123d3..5ce4440788f4 100644 --- a/lib/Fuzzer/test/reduce_inputs.test +++ b/lib/Fuzzer/test/reduce_inputs.test @@ -9,5 +9,6 @@ CHECK: INFO: found item with checksum '0eb8e4ed029b774d80f2b66408203801cb982a60' RUN: LLVMFuzzer-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --check-prefix=COUNT COUNT: READ units: 3 - +# a bit longer test +RUN: LLVMFuzzer-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=1 -reduce_inputs=1 -runs=1000000 2>&1 | FileCheck %s diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 80371780fb6d..170bc544d53f 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -365,7 +365,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break; case CallingConv::PTX_Device: Out << "ptx_device"; break; case CallingConv::X86_64_SysV: Out << "x86_64_sysvcc"; break; - case CallingConv::X86_64_Win64: Out << "x86_64_win64cc"; break; + case CallingConv::Win64: Out << "win64cc"; break; case CallingConv::SPIR_FUNC: Out << "spir_func"; break; case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break; case CallingConv::Swift: Out << "swiftcc"; break; @@ -1964,6 +1964,7 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N, Printer.printString("name", N->getName()); Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false); Printer.printMetadata("entity", N->getRawEntity()); + 
Printer.printMetadata("file", N->getRawFile()); Printer.printInt("line", N->getLine()); Out << ")"; } diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index e31779c83e3a..f56fe7089807 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -44,8 +44,8 @@ bool Constant::isNegativeZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isNegZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -70,8 +70,8 @@ bool Constant::isZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -113,9 +113,13 @@ bool Constant::isAllOnesValue() const { return Splat->isAllOnesValue(); // Check for constant vectors which are splats of -1 values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isAllOnesValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isAllOnesValue(); + return CV->getElementAsAPInt(0).isAllOnesValue(); + } + } return false; } @@ -135,9 +139,13 @@ bool Constant::isOneValue() const { return Splat->isOneValue(); // Check for constant vectors which are splats of 1 values. 
- if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isOneValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isOneValue(); + return CV->getElementAsAPInt(0).isOneValue(); + } + } return false; } @@ -157,9 +165,13 @@ bool Constant::isMinSignedValue() const { return Splat->isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return CV->getElementAsAPInt(0).isMinSignedValue(); + } + } return false; } @@ -179,9 +191,13 @@ bool Constant::isNotMinSignedValue() const { return Splat->isNotMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isNotMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return !CV->getElementAsAPInt(0).isMinSignedValue(); + } + } // It *may* contain INT_MIN, we can't tell. 
return false; @@ -2565,6 +2581,34 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { } } +APInt ConstantDataSequential::getElementAsAPInt(unsigned Elt) const { + assert(isa(getElementType()) && + "Accessor can only be used when element is an integer"); + const char *EltPtr = getElementPointer(Elt); + + // The data is stored in host byte order, make sure to cast back to the right + // type to load with the right endianness. + switch (getElementType()->getIntegerBitWidth()) { + default: llvm_unreachable("Invalid bitwidth for CDS"); + case 8: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(8, EltVal); + } + case 16: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(16, EltVal); + } + case 32: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(32, EltVal); + } + case 64: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(64, EltVal); + } + } +} + APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { const char *EltPtr = getElementPointer(Elt); @@ -2623,17 +2667,21 @@ bool ConstantDataSequential::isCString() const { return Str.drop_back().find(0) == StringRef::npos; } -Constant *ConstantDataVector::getSplatValue() const { +bool ConstantDataVector::isSplat() const { const char *Base = getRawDataValues().data(); // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) - return nullptr; + return false; + + return true; +} +Constant *ConstantDataVector::getSplatValue() const { // If they're all the same, return the 0th one as a representative. - return getElementAsConstant(0); + return isSplat() ? 
getElementAsConstant(0) : nullptr; } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 2165ae5a9470..aba770457e2f 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -14,7 +14,6 @@ #include "llvm-c/Core.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 7e598b43ac16..bce28ba3b950 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -148,10 +148,13 @@ DICompileUnit *DIBuilder::createCompileUnit( static DIImportedEntity * createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, - Metadata *NS, unsigned Line, StringRef Name, + Metadata *NS, DIFile *File, unsigned Line, StringRef Name, SmallVectorImpl &AllImportedModules) { + if (Line) + assert(File && "Source location has line number but no file"); unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size(); - auto *M = DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), Line, Name); + auto *M = + DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), File, Line, Name); if (EntitiesCount < C.pImpl->DIImportedEntitys.size()) // A new Imported Entity was just added to the context. // Add it to the Imported Modules list. 
@@ -160,33 +163,38 @@ createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, - DINamespace *NS, + DINamespace *NS, DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIImportedEntity *NS, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIModule *M, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, M, Line, StringRef(), AllImportedModules); + Context, M, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context, DINode *Decl, + DIFile *File, unsigned Line, StringRef Name) { // Make sure to use the unique identifier based metadata reference for // types that have one. 
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration, - Context, Decl, Line, Name, AllImportedModules); + Context, Decl, File, Line, Name, + AllImportedModules); } DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory, diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index 0bf68b4c53bb..c14940bad45d 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -760,12 +760,13 @@ DIObjCProperty *DIObjCProperty::getImpl( DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIImportedEntity, (Tag, Scope, Entity, Line, Name)); - Metadata *Ops[] = {Scope, Entity, Name}; + DEFINE_GETIMPL_LOOKUP(DIImportedEntity, + (Tag, Scope, Entity, File, Line, Name)); + Metadata *Ops[] = {Scope, Entity, Name, File}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index 9bd0e297f4ef..4d7e3040ecd7 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -61,24 +61,30 @@ bool BasicBlockEdge::isSingleEdge() const { //===----------------------------------------------------------------------===// template class llvm::DomTreeNodeBase; -template class llvm::DominatorTreeBase; - -template void llvm::DomTreeBuilder::Calculate( - DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT, - Function &F); -template void llvm::DomTreeBuilder::Calculate>( - DominatorTreeBase>::NodeRef>::type> &DT, - Function &F); -template bool llvm::DomTreeBuilder::Verify( - const DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT); -template bool llvm::DomTreeBuilder::Verify>( - const DominatorTreeBase>::NodeRef>::type> &DT); 
+template class llvm::DominatorTreeBase; // DomTreeBase +template class llvm::DominatorTreeBase; // PostDomTreeBase + +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBDomTree &DT, Function &F); +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBPostDomTree &DT, Function &F); + +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBDomTree &DT); +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBPostDomTree &DT); bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index e413a4f34432..bea2c7ae8ff2 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -990,24 +990,26 @@ template <> struct MDNodeKeyImpl { unsigned Tag; Metadata *Scope; Metadata *Entity; + Metadata *File; unsigned Line; MDString *Name; - MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line, - MDString *Name) - : Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), Name(Name) {} + MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, Metadata *File, + unsigned Line, MDString *Name) + : Tag(Tag), Scope(Scope), Entity(Entity), File(File), Line(Line), + Name(Name) {} MDNodeKeyImpl(const DIImportedEntity *N) : Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()), - Line(N->getLine()), Name(N->getRawName()) {} + File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()) 
{} bool isKeyOf(const DIImportedEntity *RHS) const { return Tag == RHS->getTag() && Scope == RHS->getRawScope() && - Entity == RHS->getRawEntity() && Line == RHS->getLine() && - Name == RHS->getRawName(); + Entity == RHS->getRawEntity() && File == RHS->getFile() && + Line == RHS->getLine() && Name == RHS->getRawName(); } unsigned getHashValue() const { - return hash_combine(Tag, Scope, Entity, Line, Name); + return hash_combine(Tag, Scope, Entity, File, Line, Name); } }; diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 29e2f42d3e05..995e1e570340 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -625,21 +625,21 @@ void PMTopLevelManager::schedulePass(Pass *P) { checkAnalysis = false; const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet(); - for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(), - E = RequiredSet.end(); I != E; ++I) { + for (const AnalysisID ID : RequiredSet) { - Pass *AnalysisPass = findAnalysisPass(*I); + Pass *AnalysisPass = findAnalysisPass(ID); if (!AnalysisPass) { - const PassInfo *PI = findAnalysisPassInfo(*I); + const PassInfo *PI = findAnalysisPassInfo(ID); if (!PI) { // Pass P is not in the global PassRegistry dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n"; dbgs() << "Verify if there is a pass dependency cycle." 
<< "\n"; dbgs() << "Required Passes:" << "\n"; - for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(), - E = RequiredSet.end(); I2 != E && I2 != I; ++I2) { - Pass *AnalysisPass2 = findAnalysisPass(*I2); + for (const AnalysisID ID2 : RequiredSet) { + if (ID == ID2) + break; + Pass *AnalysisPass2 = findAnalysisPass(ID2); if (AnalysisPass2) { dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; } else { @@ -1070,17 +1070,15 @@ void PMDataManager::collectRequiredAndUsedAnalyses( void PMDataManager::initializeAnalysisImpl(Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); - for (AnalysisUsage::VectorType::const_iterator - I = AnUsage->getRequiredSet().begin(), - E = AnUsage->getRequiredSet().end(); I != E; ++I) { - Pass *Impl = findAnalysisPass(*I, true); + for (const AnalysisID ID : AnUsage->getRequiredSet()) { + Pass *Impl = findAnalysisPass(ID, true); if (!Impl) // This may be analysis pass that is initialized on the fly. // If that is not the case then it will raise an assert when it is used. 
continue; AnalysisResolver *AR = P->getResolver(); assert(AR && "Analysis Resolver is not set"); - AR->addAnalysisImplsPair(*I, Impl); + AR->addAnalysisImplsPair(ID, Impl); } } @@ -1112,21 +1110,19 @@ void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{ TPM->collectLastUses(LUses, P); - for (SmallVectorImpl::iterator I = LUses.begin(), - E = LUses.end(); I != E; ++I) { + for (Pass *P : LUses) { dbgs() << "--" << std::string(Offset*2, ' '); - (*I)->dumpPassStructure(0); + P->dumpPassStructure(0); } } void PMDataManager::dumpPassArguments() const { - for (SmallVectorImpl::const_iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) { - if (PMDataManager *PMD = (*I)->getAsPMDataManager()) + for (Pass *P : PassVector) { + if (PMDataManager *PMD = P->getAsPMDataManager()) PMD->dumpPassArguments(); else if (const PassInfo *PI = - TPM->findAnalysisPassInfo((*I)->getPassID())) + TPM->findAnalysisPassInfo(P->getPassID())) if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); } @@ -1255,9 +1251,8 @@ Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) { // Destructor PMDataManager::~PMDataManager() { - for (SmallVectorImpl::iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) - delete *I; + for (Pass *P : PassVector) + delete P; } //===----------------------------------------------------------------------===// @@ -1284,35 +1279,35 @@ bool BBPassManager::runOnFunction(Function &F) { bool Changed = doInitialization(F); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &BB : F) for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { BasicBlockPass *BP = getContainedPass(Index); bool LocalChanged = false; - dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName()); + dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, BB.getName()); dumpRequiredSet(BP); initializeAnalysisImpl(BP); { // If the pass crashes, remember this. 
- PassManagerPrettyStackEntry X(BP, *I); + PassManagerPrettyStackEntry X(BP, BB); TimeRegion PassTimer(getPassTimer(BP)); - LocalChanged |= BP->runOnBasicBlock(*I); + LocalChanged |= BP->runOnBasicBlock(BB); } Changed |= LocalChanged; if (LocalChanged) dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG, - I->getName()); + BB.getName()); dumpPreservedSet(BP); dumpUsedSet(BP); verifyPreservedAnalysis(BP); removeNotPreservedAnalysis(BP); recordAvailableAnalysis(BP); - removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG); + removeDeadPasses(BP, BB.getName(), ON_BASICBLOCK_MSG); } return doFinalization(F) || Changed; diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index fdc7de6eaa34..c230a50044c7 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -103,7 +103,7 @@ std::unique_ptr Module::createRNG(const Pass* P) const { // store salt metadata from the Module constructor. Salt += sys::path::filename(getModuleIdentifier()); - return std::unique_ptr{new RandomNumberGenerator(Salt)}; + return std::unique_ptr(new RandomNumberGenerator(Salt)); } /// getNamedValue - Return the first global value in the module with diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp index 4034f9039dda..b052c76d1fed 100644 --- a/lib/Object/ArchiveWriter.cpp +++ b/lib/Object/ArchiveWriter.cpp @@ -318,7 +318,8 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind, continue; if (!(Symflags & object::SymbolRef::SF_Global)) continue; - if (Symflags & object::SymbolRef::SF_Undefined) + if (Symflags & object::SymbolRef::SF_Undefined && + !(Symflags & object::SymbolRef::SF_Indirect)) continue; unsigned NameOffset = NameOS.tell(); diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp index 740bf94d40e0..d1f46fdfa292 100644 --- a/lib/Object/COFFImportFile.cpp +++ b/lib/Object/COFFImportFile.cpp @@ -131,14 +131,14 @@ class ObjectFactory { using u32 = support::ulittle32_t; MachineTypes Machine; BumpPtrAllocator Alloc; - StringRef DLLName; + 
StringRef ImportName; StringRef Library; std::string ImportDescriptorSymbolName; std::string NullThunkSymbolName; public: ObjectFactory(StringRef S, MachineTypes M) - : Machine(M), DLLName(S), Library(S.drop_back(4)), + : Machine(M), ImportName(S), Library(S.drop_back(4)), ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} @@ -162,14 +162,17 @@ public: // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType); + + // Create a weak external file which is described in PE/COFF Aux Format 3. + NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp); }; } // namespace NewArchiveMember ObjectFactory::createImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 7; - static const uint32_t NumberOfRelocations = 3; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 7; + const uint32_t NumberOfRelocations = 3; // COFF Header coff_file_header Header{ @@ -181,7 +184,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation) + // .idata$4 - (DLLName.size() + 1)), + (ImportName.size() + 1)), u32(NumberOfSymbols), u16(0), u16(is32bit(Machine) ? 
IMAGE_FILE_32BIT_MACHINE : 0), @@ -189,7 +192,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}, u32(0), u32(0), @@ -205,7 +208,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}, u32(0), u32(0), - u32(DLLName.size() + 1), + u32(ImportName.size() + 1), u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) + sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation)), @@ -219,12 +222,12 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$2 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); - static const coff_relocation RelocationTable[NumberOfRelocations] = { + const coff_relocation RelocationTable[NumberOfRelocations] = { {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2), u16(getImgRelRelocation(Machine))}, {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)), @@ -236,9 +239,9 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { // .idata$6 auto S = Buffer.size(); - Buffer.resize(S + DLLName.size() + 1); - memcpy(&Buffer[S], DLLName.data(), DLLName.size()); - Buffer[S + DLLName.size()] = '\0'; + Buffer.resize(S + ImportName.size() + 1); + memcpy(&Buffer[S], ImportName.data(), ImportName.size()); + Buffer[S + ImportName.size()] = '\0'; // Symbol Table coff_symbol16 SymbolTable[NumberOfSymbols] = { @@ -302,13 +305,13 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, 
DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 1; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 1; // COFF Header coff_file_header Header{ @@ -325,7 +328,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'}, u32(0), u32(0), @@ -342,7 +345,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$3 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); @@ -363,12 +366,12 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { writeStringTable(Buffer, {NullImportDescriptorSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 1; uint32_t VASize = is32bit(Machine) ? 
4 : 8; // COFF Header @@ -388,7 +391,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}, u32(0), u32(0), @@ -445,14 +448,14 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { writeStringTable(Buffer, {NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef{F, DLLName}}; + return {MemoryBufferRef{F, ImportName}}; } NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, ImportType ImportType, ImportNameType NameType) { - size_t ImpSize = DLLName.size() + Sym.size() + 2; // +2 for NULs + size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); memset(Buf, 0, Size); @@ -471,17 +474,96 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, // Write symbol name and DLL name. 
memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; - memcpy(P, DLLName.data(), DLLName.size()); + memcpy(P, ImportName.data(), ImportName.size()); - return {MemoryBufferRef(StringRef(Buf, Size), DLLName)}; + return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } -std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, +NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym, + StringRef Weak, bool Imp) { + std::vector Buffer; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 5; + + // COFF Header + coff_file_header Header{ + u16(0), + u16(NumberOfSections), + u32(0), + u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))), + u32(NumberOfSymbols), + u16(0), + u16(0), + }; + append(Buffer, Header); + + // Section Header Table + const coff_section SectionTable[NumberOfSections] = { + {{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'}, + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u16(0), + u16(0), + u32(IMAGE_SCN_LNK_INFO | IMAGE_SCN_LNK_REMOVE)}}; + append(Buffer, SectionTable); + + // Symbol Table + coff_symbol16 SymbolTable[NumberOfSymbols] = { + {{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_EXTERNAL, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_WEAK_EXTERNAL, + 1}, + {{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0}, + }; + SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t); + + //__imp_ String Table + if (Imp) { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 7; + writeStringTable(Buffer, {std::string("__imp_").append(Sym), + std::string("__imp_").append(Weak)}); + } else { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 1; + 
writeStringTable(Buffer, {Sym, Weak}); + } + append(Buffer, SymbolTable); + + // Copied here so we can still use writeStringTable + char *Buf = Alloc.Allocate(Buffer.size()); + memcpy(Buf, Buffer.data(), Buffer.size()); + return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)}; +} + +std::error_code writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, MachineTypes Machine) { std::vector Members; - ObjectFactory OF(llvm::sys::path::filename(DLLName), Machine); + ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine); std::vector ImportDescriptor; Members.push_back(OF.createImportDescriptor(ImportDescriptor)); @@ -496,6 +578,12 @@ std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, if (E.Private) continue; + if (E.isWeak()) { + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false)); + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true)); + continue; + } + ImportType ImportType = IMPORT_CODE; if (E.Data) ImportType = IMPORT_DATA; diff --git a/lib/Object/COFFModuleDefinition.cpp b/lib/Object/COFFModuleDefinition.cpp index 0d69cb6b709c..ed9140d1fe08 100644 --- a/lib/Object/COFFModuleDefinition.cpp +++ b/lib/Object/COFFModuleDefinition.cpp @@ -22,6 +22,7 @@ #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/Error.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm::COFF; @@ -55,8 +56,10 @@ struct Token { StringRef Value; }; -static bool isDecorated(StringRef Sym) { - return Sym.startswith("_") || Sym.startswith("@") || Sym.startswith("?"); +static bool isDecorated(StringRef Sym, bool MingwDef) { + // mingw does not prepend "_". + return (!MingwDef && Sym.startswith("_")) || Sym.startswith("@") || + Sym.startswith("?"); } static Error createError(const Twine &Err) { @@ -83,6 +86,9 @@ public: } case '=': Buf = Buf.drop_front(); + // GNU dlltool accepts both = and ==. 
+ if (Buf.startswith("=")) + Buf = Buf.drop_front(); return Token(Equal, "="); case ',': Buf = Buf.drop_front(); @@ -120,7 +126,8 @@ private: class Parser { public: - explicit Parser(StringRef S, MachineTypes M) : Lex(S), Machine(M) {} + explicit Parser(StringRef S, MachineTypes M, bool B) + : Lex(S), Machine(M), MingwDef(B) {} Expected parse() { do { @@ -181,14 +188,17 @@ private: std::string Name; if (Error Err = parseName(&Name, &Info.ImageBase)) return Err; - // Append the appropriate file extension if not already present. - StringRef Ext = IsDll ? ".dll" : ".exe"; - if (!StringRef(Name).endswith_lower(Ext)) - Name += Ext; + + Info.ImportName = Name; // Set the output file, but don't override /out if it was already passed. - if (Info.OutputFile.empty()) + if (Info.OutputFile.empty()) { Info.OutputFile = Name; + // Append the appropriate file extension if not already present. + if (!sys::path::has_extension(Name)) + Info.OutputFile += IsDll ? ".dll" : ".exe"; + } + return Error::success(); } case KwVersion: @@ -213,9 +223,9 @@ private: } if (Machine == IMAGE_FILE_MACHINE_I386) { - if (!isDecorated(E.Name)) + if (!isDecorated(E.Name, MingwDef)) E.Name = (std::string("_").append(E.Name)); - if (!E.ExtName.empty() && !isDecorated(E.ExtName)) + if (!E.ExtName.empty() && !isDecorated(E.ExtName, MingwDef)) E.ExtName = (std::string("_").append(E.ExtName)); } @@ -308,11 +318,13 @@ private: std::vector Stack; MachineTypes Machine; COFFModuleDefinition Info; + bool MingwDef; }; Expected parseCOFFModuleDefinition(MemoryBufferRef MB, - MachineTypes Machine) { - return Parser(MB.getBuffer(), Machine).parse(); + MachineTypes Machine, + bool MingwDef) { + return Parser(MB.getBuffer(), Machine, MingwDef).parse(); } } // namespace object diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 1e9b0c5b0454..0a2053477caf 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -227,8 +227,11 @@ uint32_t 
COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const { if (Symb.isExternal() || Symb.isWeakExternal()) Result |= SymbolRef::SF_Global; - if (Symb.isWeakExternal()) + if (Symb.isWeakExternal()) { Result |= SymbolRef::SF_Weak; + // We use indirect to allow the archiver to write weak externs + Result |= SymbolRef::SF_Indirect; + } if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE) Result |= SymbolRef::SF_Absolute; diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 0b2ea61c5fe0..81046b217862 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -141,6 +141,33 @@ template struct MemberRecordImpl : public MemberRecordBase { } // end namespace CodeViewYAML } // end namespace llvm +void ScalarTraits::output(const GUID &G, void *, llvm::raw_ostream &OS) { + OS << G; +} + +StringRef ScalarTraits::input(StringRef Scalar, void *Ctx, GUID &S) { + if (Scalar.size() != 38) + return "GUID strings are 38 characters long"; + if (Scalar[0] != '{' || Scalar[37] != '}') + return "GUID is not enclosed in {}"; + if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' || + Scalar[24] != '-') + return "GUID sections are not properly delineated with dashes"; + + uint8_t *OutBuffer = S.Guid; + for (auto Iter = Scalar.begin(); Iter != Scalar.end();) { + if (*Iter == '-' || *Iter == '{' || *Iter == '}') { + ++Iter; + continue; + } + uint8_t Value = (llvm::hexDigitValue(*Iter++) << 4); + Value |= llvm::hexDigitValue(*Iter++); + *OutBuffer++ = Value; + } + + return ""; +} + void ScalarTraits::output(const TypeIndex &S, void *, raw_ostream &OS) { OS << S.getIndex(); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index bcd365236e46..f3b438e829d6 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -390,27 +390,29 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) { return Name; } +namespace { +struct OptionInfo { + std::string Name; + 
StringRef HelpText; +}; +} // namespace + static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, - std::vector> &OptionHelp) { + std::vector &OptionHelp) { OS << Title << ":\n"; // Find the maximum option length. unsigned OptionFieldWidth = 0; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - // Skip titles. - if (!OptionHelp[i].second) - continue; - // Limit the amount of padding we are willing to give up for alignment. - unsigned Length = OptionHelp[i].first.size(); + unsigned Length = OptionHelp[i].Name.size(); if (Length <= 23) OptionFieldWidth = std::max(OptionFieldWidth, Length); } const unsigned InitialPad = 2; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - const std::string &Option = OptionHelp[i].first; + const std::string &Option = OptionHelp[i].Name; int Pad = OptionFieldWidth - int(Option.size()); OS.indent(InitialPad) << Option; @@ -419,7 +421,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, OS << "\n"; Pad = OptionFieldWidth + InitialPad; } - OS.indent(Pad + 1) << OptionHelp[i].second << '\n'; + OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n'; } } @@ -458,8 +460,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, // Render help text into a map of group-name to a list of (option, help) // pairs. 
- using helpmap_ty = - std::map>>; + using helpmap_ty = std::map>; helpmap_ty GroupedOptionHelp; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { @@ -478,7 +479,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, if (const char *Text = getOptionHelpText(Id)) { const char *HelpGroup = getOptionHelpGroup(*this, Id); const std::string &OptName = getOptionHelpName(*this, Id); - GroupedOptionHelp[HelpGroup].push_back(std::make_pair(OptName, Text)); + GroupedOptionHelp[HelpGroup].push_back({OptName, Text}); } } diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index fe69151665c6..2fd4f3ea0d45 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -45,22 +45,36 @@ static void *ErrorHandlerUserData = nullptr; static fatal_error_handler_t BadAllocErrorHandler = nullptr; static void *BadAllocErrorHandlerUserData = nullptr; +#if LLVM_ENABLE_THREADS == 1 // Mutexes to synchronize installing error handlers and calling error handlers. // Do not use ManagedStatic, or that may allocate memory while attempting to // report an OOM. +// +// This usage of std::mutex has to be conditionalized behind ifdefs because +// of this script: +// compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +// That script attempts to statically link the LLVM symbolizer library with the +// STL and hide all of its symbols with 'opt -internalize'. To reduce size, it +// cuts out the threading portions of the hermetic copy of libc++ that it +// builds. We can remove these ifdefs if that script goes away. 
static std::mutex ErrorHandlerMutex; static std::mutex BadAllocErrorHandlerMutex; +#endif void llvm::install_fatal_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif assert(!ErrorHandler && "Error handler already registered!\n"); ErrorHandler = handler; ErrorHandlerUserData = user_data; } void llvm::remove_fatal_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif ErrorHandler = nullptr; ErrorHandlerUserData = nullptr; } @@ -83,7 +97,9 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif handler = ErrorHandler; handlerData = ErrorHandlerUserData; } @@ -112,14 +128,18 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif assert(!ErrorHandler && "Bad alloc error handler already registered!\n"); BadAllocErrorHandler = handler; BadAllocErrorHandlerUserData = user_data; } void llvm::remove_bad_alloc_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif BadAllocErrorHandler = nullptr; BadAllocErrorHandlerUserData = nullptr; } @@ -130,7 +150,9 @@ void llvm::report_bad_alloc_error(const char *Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. 
+#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif Handler = BadAllocErrorHandler; HandlerData = BadAllocErrorHandlerUserData; } diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 9f22f89b3c9e..5cf0316d4d71 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -250,6 +250,8 @@ StringRef sys::detail::getHostCPUNameForS390x( Pos += sizeof("machine = ") - 1; unsigned int Id; if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) { + if (Id >= 3906 && HaveVectorSupport) + return "z14"; if (Id >= 2964 && HaveVectorSupport) return "z13"; if (Id >= 2827) @@ -460,8 +462,8 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_X64) #if defined(__GNUC__) || defined(__clang__) +#if defined(__x86_64__) // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. // FIXME: should we save this for Clang? 
__asm__("movq\t%%rbx, %%rsi\n\t" @@ -470,43 +472,24 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) : "a"(value), "c"(subleaf)); return false; -#elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -#elif defined(__i386__) || defined(_M_IX86) -#if defined(__GNUC__) || defined(__clang__) +#elif defined(__i386__) __asm__("movl\t%%ebx, %%esi\n\t" "cpuid\n\t" "xchgl\t%%ebx, %%esi\n\t" : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) : "a"(value), "c"(subleaf)); return false; -#elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; #else return true; #endif +#elif defined(_MSC_VER) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; #else return true; #endif diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index e58f856ca244..ea59ba62d7bd 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -13,8 +13,6 @@ #include "llvm/Support/Path.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 13bb6f23bc83..e8ef1d2fd8b9 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -452,6 +452,8 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions, Features.push_back("+ras"); if (Extensions & AArch64::AEK_LSE) 
Features.push_back("+lse"); + if (Extensions & AArch64::AEK_SVE) + Features.push_back("+sve"); return true; } diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 601084f9eae3..65eda246a7fe 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -60,6 +60,14 @@ Input::Input(StringRef InputContent, void *Ctxt, DocIterator = Strm->begin(); } +Input::Input(MemoryBufferRef Input, void *Ctxt, + SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt) + : IO(Ctxt), Strm(new Stream(Input, SrcMgr, false, &EC)) { + if (DiagHandler) + SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt); + DocIterator = Strm->begin(); +} + Input::~Input() = default; std::error_code Input::error() { return EC; } diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 9480cd46d28f..dd58eccee957 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -326,13 +326,30 @@ raw_ostream &raw_ostream::operator<<(const formatv_object_base &Obj) { } raw_ostream &raw_ostream::operator<<(const FormattedString &FS) { - unsigned Len = FS.Str.size(); - int PadAmount = FS.Width - Len; - if (FS.RightJustify && (PadAmount > 0)) - this->indent(PadAmount); - this->operator<<(FS.Str); - if (!FS.RightJustify && (PadAmount > 0)) + if (FS.Str.size() >= FS.Width || FS.Justify == FormattedString::JustifyNone) { + this->operator<<(FS.Str); + return *this; + } + const size_t Difference = FS.Width - FS.Str.size(); + switch (FS.Justify) { + case FormattedString::JustifyLeft: + this->operator<<(FS.Str); + this->indent(Difference); + break; + case FormattedString::JustifyRight: + this->indent(Difference); + this->operator<<(FS.Str); + break; + case FormattedString::JustifyCenter: { + int PadAmount = Difference / 2; this->indent(PadAmount); + this->operator<<(FS.Str); + this->indent(Difference - PadAmount); + break; + } + default: + llvm_unreachable("Bad Justification"); + } return *this; } diff --git a/lib/Target/AArch64/AArch64.h 
b/lib/Target/AArch64/AArch64.h index 37b9690d0434..1dda746a6be1 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -44,6 +44,8 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -66,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 53eef79c4df3..436bf1193304 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -50,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". 
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -269,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 938779d23690..291bc5ea858e 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f64], CCBitConvertToType>, + CCDelegateTo +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index ee54550c9900..b72f23b109d9 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -102,6 +102,10 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( case AArch64::LDADDALh: case AArch64::LDADDALs: case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: case AArch64::LDEORALb: case AArch64::LDEORALh: case AArch64::LDEORALs: diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 000000000000..c0e22355a9ff --- /dev/null +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// +// +// The 
LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and re-write loads to prevent unintnentional tag collisions. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of regsiters"); + +namespace { + +class FalkorMarkStridedAccesses { +public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { 
+public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing). + // AU.addPreserved(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis(); + const AArch64Subtarget *ST = + TPC.getTM().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis().getLoopInfo(); + ScalarEvolution &SE = getAnalysis().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction 
&I : *BB) { + LoadInst *LoadI = dyn_cast(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: 
+ case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case 
AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: 
+ case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + 
case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = 
false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. 
+ TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. 
+ if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. + BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. 
+ TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 3682b62d2b84..97396057dce0 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5138,6 +5138,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. (void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index e96ee7d29b3e..4907d082eda0 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) @@ -950,7 +954,13 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); + + unsigned GPRSaveSize = AFI->getVarArgsGPRSize(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + if (IsWin64) + Offset -= alignTo(GPRSaveSize, 16); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 04687847c1a3..06005f6b6886 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -239,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. 
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 60fde5caa339..c6150f9e5d1d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2650,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2668,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; @@ -2824,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. 
+ // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2869,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector MemOps; @@ -2881,7 +2889,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2890,7 +2901,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? 
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2899,7 +2914,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -4491,6 +4506,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4562,8 +4592,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? 
LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4571,7 +4607,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); @@ -7451,6 +7488,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { @@ -10567,9 +10612,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Currently leaving And and Sub to LLSC - if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) - return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? 
AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } @@ -10783,7 +10825,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { - if (Subtarget->isTargetDarwin()) + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb288..3b0e0f1de894 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,19 @@ public: bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. + + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } + bool isCheapToSpeculateCttz() const override { return true; } @@ -455,6 +468,8 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -541,6 +556,7 @@ private: SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td 
b/lib/Target/AArch64/AArch64InstrAtomics.td index de283b70210f..eec41ddbc159 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -451,3 +451,13 @@ def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)> def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index dba3e4bdf82f..c0c6055c358f 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -52,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - MachineMemOperand::MOTargetFlag1; - static cl::opt TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -1715,6 +1712,13 @@ void 
AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all MachineMemOperands for a hint that the load/store is strided. +bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { ArrayRef> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair TargetFlags[] = - {{MOSuppressPair, "aarch64-suppress-pair"}}; + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 0809ede4df2a..1765a0263ea4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ public: /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. + bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. 
bool isUnscaledLdSt(unsigned Opc) const; @@ -356,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. /// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0be14673eb20..0dcf07f98412 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -37,6 +37,8 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4a0a7c36baf8..ffb27834c31c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -82,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({Op, 1, s1}, Legal); } - for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index fab92e139dd0..9f7dcb3fe1c3 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction 
&MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -167,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a3238cf3b60f..ea6112452736 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -134,7 +134,9 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: PrefFunctionAlignment = 4; break; - case CortexA73: break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } @@ -171,7 +173,8 @@ struct AArch64GISelActualAccessor : public GISelAccessor { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this), GISel() { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index db53946cbc77..5a1f45ee2552 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ protected: bool HasFullFP16 = false; bool HasSPE = false; bool HasLSLFast = false; + bool HasSVE = false; // HasZeroCycleRegMove - Has zero-cycle 
register mov instructions. bool HasZeroCycleRegMove = false; @@ -251,6 +252,7 @@ public: bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -304,6 +306,17 @@ public: bool enableEarlyIfConversion() const override; std::unique_ptr getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6237b8f3e7b9..ba28c01a2eff 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -138,6 +138,9 @@ static cl::opt EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(-1)); +static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine X(getTheAArch64leTarget()); @@ -158,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -182,7 +187,7 @@ static std::string computeDataLayout(const Triple &TT, if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) - return "e-m:w-i64:64-i128:128-n32:64-S128"; + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -346,8 +351,12 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); @@ -478,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. 
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorHWPFFixPass()); + } } void AArch64PassConfig::addPreEmitPass() { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index fefa7e26b79f..85de02e859e0 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,8 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some, targets is + // deprecated and should not be used. const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e841fb894519..a79d51820545 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -86,7 +86,7 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands); bool parseDirectiveArch(SMLoc L); bool parseDirectiveCPU(SMLoc L); @@ -3257,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { +std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, + OperandVector &Operands) { switch (ErrCode) { case Match_MissingFeature: return Error(Loc, @@ -3380,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return 
Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + std::string Suggestion = AArch64MnemonicSpellCheck( + ((AArch64Operand &)*Operands[0]).getToken(), + ComputeAvailableFeatures(STI->getFeatureBits())); + return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); + } default: llvm_unreachable("unexpected error code!"); } @@ -3707,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, Msg); } case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); + return showMatchError(IDLoc, MatchResult, Operands); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -3726,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } case Match_InvalidMemoryIndexed1: case Match_InvalidMemoryIndexed2: @@ -3784,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } } diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 02b12b5e90ca..f7e0a5c7bed3 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp + AArch64FalkorHWPFFix.cpp AArch64FastISel.cpp AArch64A53Fix835769.cpp AArch64FrameLowering.cpp diff --git 
a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a7a7daf4b4a5..2bd0cbf9f7c6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -218,6 +220,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 7862a03e771c..31762b9e4cd5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -27,8 +27,7 @@ namespace { class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: AArch64WinCOFFObjectWriter() - : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { - } + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} ~AArch64WinCOFFObjectWriter() override = default; @@ -36,19 +35,59 @@ public: const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; - bool recordRelocation(const MCFixup &) const override; + bool recordRelocation(const MCFixup &) const override; }; } // end anonymous namespace -unsigned -AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, - const MCValue &Target, - const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend 
&MAB) const { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + + switch (static_cast(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + + case FK_Data_4: + switch (Modifier) { + default: + return COFF::IMAGE_REL_ARM64_ADDR32; + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM64_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM64_SECREL; + } + + case FK_Data_8: + return COFF::IMAGE_REL_ARM64_ADDR64; + + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM64_SECTION; + + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM64_SECREL; + + case AArch64::fixup_aarch64_add_imm12: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A; + + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; + + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; + + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + return COFF::IMAGE_REL_ARM64_BRANCH26; + } } bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 5a799b2d88d0..568682899be5 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -56,7 +56,7 @@ extern char 
&AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 7235d8fae332..c68e5861ff25 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -15,8 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -26,26 +28,27 @@ using namespace llvm; namespace { -class AMDGPUAnnotateKernelFeatures : public ModulePass { +class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: + const TargetMachine *TM = nullptr; AMDGPUAS AS; - static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS); - void addAttrToCallers(Function *Intrin, StringRef AttrName); - bool addAttrsForIntrinsics(Module &M, ArrayRef); + bool addFeatureAttributes(Function &F); public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {} - bool runOnModule(Module &M) override; + AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} + + bool doInitialization(CallGraph &CG) override; + bool runOnSCC(CallGraphSCC &SCC) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - ModulePass::getAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); } static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS); @@ -121,16 +124,130 @@ bool 
AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( return false; } -// Return true if an addrspacecast is used that requires the queue ptr. -bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, - AMDGPUAS AS) { +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. +static StringRef intrinsicToAttrName(Intrinsic::ID ID, + bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool handleAttr(Function &Parent, const Function &Callee, + StringRef Name) { + if (Callee.hasFnAttribute(Name)) { + Parent.addFnAttr(Name); + return true; + } + + return false; +} + +static void copyFeaturesToFunction(Function &Parent, const Function &Callee, + bool &NeedQueuePtr) { + // X ids unnecessarily propagated to kernels. 
+ static const StringRef AttrNames[] = { + { "amdgpu-work-item-id-x" }, + { "amdgpu-work-item-id-y" }, + { "amdgpu-work-item-id-z" }, + { "amdgpu-work-group-id-x" }, + { "amdgpu-work-group-id-y" }, + { "amdgpu-work-group-id-z" }, + { "amdgpu-dispatch-ptr" }, + { "amdgpu-dispatch-id" }, + { "amdgpu-kernarg-segment-ptr" } + }; + + if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) + NeedQueuePtr = true; + + for (StringRef AttrName : AttrNames) + handleAttr(Parent, Callee, AttrName); +} + +bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { + const AMDGPUSubtarget &ST = TM->getSubtarget(F); + bool HasFlat = ST.hasFlatAddressSpace(); + bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet ConstantExprVisited; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + bool Changed = false; + bool NeedQueuePtr = false; + bool HaveCall = false; + bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (CS) { + Function *Callee = CS.getCalledFunction(); + + // TODO: Do something with indirect calls. 
+ if (!Callee) { + if (!CS.isInlineAsm()) + HaveCall = true; + continue; + } + + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID == Intrinsic::not_intrinsic) { + HaveCall = true; + copyFeaturesToFunction(F, *Callee, NeedQueuePtr); + Changed = true; + } else { + bool NonKernelOnly = false; + StringRef AttrName = intrinsicToAttrName(IID, + NonKernelOnly, NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } + } + } + + if (NeedQueuePtr || HasApertureRegs) + continue; + if (const AddrSpaceCastInst *ASC = dyn_cast(&I)) { - if (castRequiresQueuePtr(ASC, AS)) - return true; + if (castRequiresQueuePtr(ASC, AS)) { + NeedQueuePtr = true; + continue; + } } for (const Use &U : I.operands()) { @@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) - return true; + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + NeedQueuePtr = true; + break; + } } } } - return false; -} - -void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, - StringRef AttrName) { - SmallPtrSet SeenFuncs; - - for (User *U : Intrin->users()) { - // CallInst is the only valid user for an intrinsic. - CallInst *CI = cast(U); - - Function *CallingFunction = CI->getParent()->getParent(); - if (SeenFuncs.insert(CallingFunction).second) - CallingFunction->addFnAttr(AttrName); + if (NeedQueuePtr) { + F.addFnAttr("amdgpu-queue-ptr"); + Changed = true; } -} - -bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( - Module &M, - ArrayRef IntrinsicToAttr) { - bool Changed = false; - for (const StringRef *Arr : IntrinsicToAttr) { - if (Function *Fn = M.getFunction(Arr[0])) { - addAttrToCallers(Fn, Arr[1]); - Changed = true; - } + // TODO: We could refine this to captured pointers that could possibly be + // accessed by flat instructions. 
For now this is mostly a poor way of + // estimating whether there are calls before argument lowering. + if (HasFlat && !IsFunc && HaveCall) { + F.addFnAttr("amdgpu-flat-scratch"); + Changed = true; } return Changed; } -bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { +bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { + Module &M = SCC.getCallGraph().getModule(); Triple TT(M.getTargetTriple()); - AS = AMDGPU::getAMDGPUAS(M); - - static const StringRef IntrinsicToAttr[][2] = { - // .x omitted - { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, - { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, - - { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, - { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, - - { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, - { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, - - // .x omitted - { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, - { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; - static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }, - { "llvm.trap", "amdgpu-queue-ptr" }, - { "llvm.debugtrap", "amdgpu-queue-ptr" } - }; - - // TODO: We should not add the attributes if the known compile time workgroup - // size is 1 for y/z. - - // TODO: Intrinsics that require queue ptr. + bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; - // We do not need to note the x workitem or workgroup id because they are - // always initialized. 
+ Changed |= addFeatureAttributes(*F); + } - bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { - Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); - for (Function &F : M) { - if (F.hasFnAttribute("amdgpu-queue-ptr")) - continue; + return Changed; +} - auto *TPC = getAnalysisIfAvailable(); - bool HasApertureRegs = TPC && TPC->getTM() - .getSubtarget(F) - .hasApertureRegs(); - if (!HasApertureRegs && hasAddrSpaceCast(F, AS)) - F.addFnAttr("amdgpu-queue-ptr"); - } - } +bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + report_fatal_error("TargetMachine is required"); - return Changed; + AS = AMDGPU::getAMDGPUAS(CG.getModule()); + TM = &TPC->getTM(); + return false; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { +Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { return new AMDGPUAnnotateKernelFeatures(); } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 83ad1a5c6ee3..2247814cfe55 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -268,19 +268,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF)); - OutStreamer->emitRawComment(" codeLenInByte = " + - Twine(getFunctionCodeSize(MF)), false); - OutStreamer->emitRawComment( - " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false); - OutStreamer->emitRawComment( - " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false); - OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); OutStreamer->emitRawComment( " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); - OutStreamer->emitRawComment( - " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false); OutStreamer->emitRawComment( " LDSByteSize: " + 
Twine(CurrentProgramInfo.LDSSize) + " bytes/workgroup (compile time only)", false); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2553cf4da0fe..258b1737deb3 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FABS); + setTargetDAGCombine(ISD::AssertZext); + setTargetDAGCombine(ISD::AssertSext); } //===----------------------------------------------------------------------===// @@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting /// input values across multiple registers. Each item in the Ins array -/// represents a single value that will be stored in regsters. Ins[x].VT is +/// represents a single value that will be stored in registers. Ins[x].VT is /// the value type of the value that will be stored in the register, so /// whatever SDNode we lower the argument to needs to be this type. /// @@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +// FIXME: This should go in generic DAG combiner with an isTruncateFree check, +// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU +// issues. 
+SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + // (vt2 (assertzext (truncate vt0:x), vt1)) -> + // (vt2 (truncate (assertzext vt0:x, vt1))) + if (N0.getOpcode() == ISD::TRUNCATE) { + SDValue N1 = N->getOperand(1); + EVT ExtVT = cast(N1)->getVT(); + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.bitsGE(ExtVT)) { + SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); + return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); + } + } + + return SDValue(); +} /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::AssertZext: + case ISD::AssertSext: + return performAssertSZExtCombine(N, DCI); } return SDValue(); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a45234e2b39f..d85aada6053a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -76,6 +76,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1bc5a52053ec..779617629010 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -277,7 +277,7 @@ std::pair AMDGPUSubtarget::getWavesPerEU( // Make 
sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 22cede59086a..d4b6a5fe8020 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ public: return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e3c90f250600..b37c274102bc 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f26e49295e69..966c6fec20c6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) @@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return 
decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3d71db909e20..4c755be09999 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -70,6 +70,7 @@ public: MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 3af242d9ea66..0aad8f0843d6 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 08a64de38501..7334781916d8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // No replacement necessary. if (ScratchWaveOffsetReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); } @@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // this point it appears we need the setup. 
This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::NoRegister) { + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + DebugLoc DL; - int64_t StackSize = MF.getFrameInfo().getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); if (StackSize == 0) { BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba570b9ebbb..2356405f0919 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info, - bool NeedSP) { + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. 
MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); - assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), - Info.getStackPtrOffsetReg())); - } } SDValue SITargetLowering::LowerFormalArguments( @@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); @@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - - // TODO: Could maybe omit SP if only tail calls? 
- bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); - - if (NeedSP) { - unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); - CCInfo.AllocateReg(StackPtrReg); - Info->setStackPtrOffsetReg(StackPtrReg); - } } return Chains.empty() ? Chain : @@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. 
case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); case ISD::ConstantFP: { auto F = cast(Op)->getValueAPF(); @@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine( if (!CFP) { SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType().getScalarType(); + auto ST = getSubtarget(); + + if (((VT == MVT::f32 && ST->hasFP32Denormals()) || + (VT == MVT::f64 && ST->hasFP64Denormals()) || + (VT == MVT::f16 && ST->hasFP16Denormals())) && + DAG.isKnownNeverNaN(N0)) + return N0; bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(N0, getSubtarget())) + isCanonicalized(DAG, N0, ST)) return N0; return SDValue(); @@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { } return TargetLowering::getConstraintType(Constraint); } + +// Figure out which registers should be reserved for stack access. Only after +// the function is legalized do we know all of the non-spill stack objects or if +// calls are present. +void SITargetLowering::finalizeLowering(MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + if (Info->isEntryFunction()) { + // Callable functions have fixed registers used for stack access. 
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + } + + // We have to assume the SP is needed in case there are calls in the function + // during lowering. Calls are only detected after the function is + // lowered. We're about to reserve registers, so don't bother using it if we + // aren't really going to use it. + bool NeedSP = !Info->isEntryFunction() || + MFI.hasVarSizedObjects() || + MFI.hasCalls(); + + if (NeedSP) { + unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); + Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); + } + + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + + TargetLoweringBase::finalizeLowering(MF); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 83392a7ab1b2..e6bb3d6cd419 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -232,6 +232,8 @@ public: ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; + + void finalizeLowering(MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 160f8837d49c..a7e0feb10b9f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector Worklist; - Worklist.push_back(&TopInst); + SetVectorType 
Worklist; + Worklist.insert(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); @@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } -void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, +void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp( } void SIInstrInfo::splitScalar64BitBCNT( - SmallVectorImpl &Worklist, MachineInstr &Inst) const { + SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, 
MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const { + SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); + Worklist.insert(&UseMI); do { ++I; @@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SmallVectorImpl &Worklist, +void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl &Worklist, } void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SmallVectorImpl &Worklist) const { + MachineInstr &SCCDefInst, SetVectorType &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. 
for (MachineInstr &MI : @@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) - Worklist.push_back(&MI); + Worklist.insert(&MI); } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index d00c0d4a7f4e..3dd5bc89e6c7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -19,6 +19,7 @@ #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" namespace llvm { @@ -38,6 +39,8 @@ private: EXECZ = 3 }; + typedef SmallSetVector SetVectorType; + static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); @@ -56,30 +59,30 @@ private: void swapOperands(MachineInstr &Inst) const; - void lowerScalarAbs(SmallVectorImpl &Worklist, + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SmallVectorImpl &Worklist, + void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; - void movePackToVALU(SmallVectorImpl &Worklist, + void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const; + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SmallVectorImpl &Worklist) const; + SetVectorType &Worklist) const; const TargetRegisterClass * 
getDestEquivalentVGPRClass(const MachineInstr &Inst) const; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index ffb01363e131..088173680fa8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1436,7 +1436,7 @@ class VOPProfile _ArgVT> { field bit IsPacked = isPackedType.ret; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasOMod = !if(HasOpSel, 0, isFloatType.ret); field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bcc685015cf5..ba69e42d9125 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1060,7 +1060,7 @@ def : Pat < class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3203c38dae34..a7c8166ff6d2 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -23,10 +23,10 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), - FrameOffsetReg(AMDGPU::NoRegister), - StackPtrOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -42,6 +42,9 @@ 
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - return; + + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = true; + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (ST.debuggerEmitPrologue()) { // Enable everything. + WorkGroupIDX = true; WorkGroupIDY = true; WorkGroupIDZ = true; + WorkItemIDX = true; WorkItemIDY = true; WorkItemIDZ = true; } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; if (F->hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + if (F->hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; @@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. 
- if (WorkItemIDZ) - WorkItemIDY = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); + bool HasStackObjects = FrameInfo.hasStackObjects(); + + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } } - if (ST.isAmdCodeObjectV2(MF)) { + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. 
+ if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 05aa249584bf..4c7f38a09a48 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. unsigned PSInputAddr; unsigned PSInputEnable; @@ -377,10 +382,13 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); StackPtrOffsetReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. unsigned getStackPtrOffsetReg() const { return StackPtrOffsetReg; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ef6ad4ad0c8f..4a3fbb4593bb 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // We have to assume the SP is needed in case there are calls in the function, + // which is detected after the function is lowered. If we aren't really going + // to need SP, don't bother reserving it. 
unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index fc808011cd88..54ea7805e18d 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -23,6 +23,13 @@ class SIReg regIdx = 0> : Register, def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; +// Pseudo-registers: Used as placeholders during isel and immediately +// replaced, never seeing the verifier. +def PRIVATE_RSRC_REG : SIReg<"", 0>; +def FP_REG : SIReg<"", 0>; +def SP_REG : SIReg<"", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; + // VCC for 64-bit instructions def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias { @@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, + FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let AllocationPriority = 7; } @@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, + (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_128 : RegisterOperand; +def VSrc_128 : RegisterOperand { + let DecoderMethod = "DecodeVS_128RegisterClass"; +} 
//===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26515b27bb77..67ad904ca972 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { } bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - - if (Reg0 == Reg1) { - return true; + for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { + if (*R == Reg1) return true; } - - unsigned SubReg0 = TRI->getSubReg(Reg0, 1); - if (SubReg0 == 0) { - return TRI->getSubRegIndex(Reg1, Reg0) > 0; - } - - for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { - if (isRegIntersect(Reg1, SubReg0, TRI)) { - return true; - } - SubReg0 = TRI->getSubReg(Reg0, Idx); - } - return false; } diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 7b9bc71ad4c7..d5acb49b4f39 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo pattern=[]> : class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. 
-class SI2_VI3Alias : InstAlias < +class SI2_VI3Alias : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index a8ca593f14ed..92ed0706dc01 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,21 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -92,6 +96,7 @@ class VOP3_Profile : VOPProfile { class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
let HasModifiers = 0; + let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index f2de1f995726..3becf758aaa3 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -34,6 +34,9 @@ class VOP3_VOP3PInst, fma>; +def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; +def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; + def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>; def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum>; @@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; @@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; } +def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; + def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>; def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; @@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi op> { } } +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; @@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; defm 
V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index f3482a22d5dc..b636fc9be431 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -148,6 +148,19 @@ class VOPCInstAlias : let SubtargetPredicate = AssemblerPredicate; } +class getVOPCPat64 : LetDummies { + list ret = !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); +} + + multiclass VOPC_Pseudos , + def _e64 : VOP3_Pseudo.ret>, Commutable_REV { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -634,7 +640,7 @@ class FCMP_Pattern : Pat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE, DSTOMOD.NONE) + DSTCLAMP.NONE) >; def : FCMP_Pattern ; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 77b7952b22a8..b47538ba0349 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -136,6 +136,8 @@ class VOP3_Real : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + + VOPProfile Pfl = ps.Pfl; } // XXX - Is there any reason to distingusih this from regular VOP3 diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 
c40b4450a5b5..e49c1babac21 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -17,144 +17,172 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// -// ARM Helper classes. +// ARM Subtarget state. // -class ProcNoItin Features> - : Processor; +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", + "true", "Thumb mode">; + +def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", + "true", "Use software floating " + "point features.">; -class Architecture features > - : SubtargetFeature; //===----------------------------------------------------------------------===// -// ARM Subtarget state. +// ARM Subtarget features. // -def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", - "Thumb mode">; +// Floating Point, HW Division and Neon Support +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; -def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; -//===----------------------------------------------------------------------===// -// ARM Subtarget features. 
-// +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; + +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision " + "floating point">; + +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3, FeatureFP16]>; + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision " + "floating point", + [FeatureFPARMv8]>; + +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports " + "single precision only">; + +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict FP to 16 double registers">; + +def FeatureHWDivThumb : SubtargetFeature<"hwdiv", + "HasHardwareDivideInThumb", "true", + "Enable divide instructions in Thumb">; + +def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", + "HasHardwareDivideInARM", "true", + "Enable divide instructions in ARM mode">; + +// Atomic Support +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb/dsb) instructions">; + +def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", + "Has v7 clrex instruction">; -def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", - "Enable VFP2 instructions">; -def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", - "Enable VFP3 instructions", - [FeatureVFP2]>; -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable NEON instructions", - [FeatureVFP3]>; -def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", - "Enable Thumb2 instructions">; -def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", - "Does not support ARM mode execution", - [ModeThumb]>; -def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", - "Enable half-precision 
floating point">; -def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", - "Enable VFP4 instructions", - [FeatureVFP3, FeatureFP16]>; -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", - "true", "Enable ARMv8 FP", - [FeatureVFP4]>; -def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", - "Enable full half-precision floating point", - [FeatureFPARMv8]>; -def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict FP to 16 double registers">; -def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", - "true", - "Enable divide instructions in Thumb">; -def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", - "HasHardwareDivideInARM", "true", - "Enable divide instructions in ARM mode">; -def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", - "Has data barrier (dmb / dsb) instructions">; -def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", - "Has v7 clrex instruction">; def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", - "Has v8 acquire/release (lda/ldaex etc) instructions">; -def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", - "FP compare + branch is slow">; -def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", - "Floating point unit supports single precision only">; -def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", - "Enable support for Performance Monitor extensions">; -def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", - "Enable support for TrustZone security extensions">; -def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", - "Enable support for ARMv8-M Security Extensions">; -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable support for Cryptography extensions", - [FeatureNEON]>; -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable support for CRC 
instructions">; + "Has v8 acquire/release (lda/ldaex " + " etc) instructions">; + + +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance " + "Monitor extensions">; + + +// TrustZone Security Extensions +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone " + "security extensions">; + +def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", + "Enable support for ARMv8-M " + "Security Extensions">; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for " + "Cryptography extensions", + [FeatureNEON]>; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; + + // Not to be confused with FeatureHasRetAddrStack (return address stack) -def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", - "Enable Reliability, Availability and Serviceability extensions">; -def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", - "Enable fast computation of positive address offsets">; -def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", - "CPU fuses AES crypto operations">; - -// Cyclone has preferred instructions for zeroing VFP registers, which can -// execute in 0 cycles. -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -// Whether or not it may be profitable to unpredicate certain instructions -// during if conversion. 
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable Reliability, Availability " + "and Serviceability extensions">; + +// Fast computation of non-negative address offsets +def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", + "Enable fast computation of " + "positive address offsets">; + +// Fast execution of AES crypto operations +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +// Cyclone can zero VFP registers in 0 cycles. +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +// Whether it is profitable to unpredicate certain instructions during if-conversion def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", - "IsProfitableToUnpredicate", - "true", + "IsProfitableToUnpredicate", "true", "Is profitable to unpredicate">; // Some targets (e.g. Swift) have microcoded VGETLNi32. -def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", - "HasSlowVGETLNi32", "true", - "Has slow VGETLNi32 - prefer VMOV">; +def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", + "HasSlowVGETLNi32", "true", + "Has slow VGETLNi32 - prefer VMOV">; // Some targets (e.g. Swift) have microcoded VDUP32. -def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", - "Has slow VDUP32 - prefer VMOV">; +def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", + "true", + "Has slow VDUP32 - prefer VMOV">; // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON // for scalar FP, as this allows more effective execution domain optimization. 
-def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", - "true", "Prefer VMOVSR">; +def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", + "true", "Prefer VMOVSR">; // Swift has ISHST barriers compatible with Atomic Release semantics but weaker // than ISH def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", - "true", "Prefer ISHST barriers">; + "true", "Prefer ISHST barriers">; // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. -def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", - "Has muxed AGU and NEON/FPU">; +def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", + "true", + "Has muxed AGU and NEON/FPU">; -// On some targets, a VLDM/VSTM starting with an odd register number needs more -// microops than single VLDRS. +// Whether VLDM/VSTM starting with odd register number need more microops +// than single VLDRS def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", - "true", "VLDM/VSTM starting with an odd register is slow">; + "true", "VLDM/VSTM starting " + "with an odd register is slow">; // Some targets have a renaming dependency when loading into D subregisters. def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. -def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", - "Expand VFP/NEON MLA/MLS instructions">; +def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", + "ExpandMLx", "true", + "Expand VFP/NEON MLA/MLS instructions">; // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. 
def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", @@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from // VFP to NEON, as an execution domain optimization. -def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", - "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">; +def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", + "UseNEONForFPMovs", "true", + "Convert VMOVSR, VMOVRS, " + "VMOVS to NEON">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. This affects instruction selection and should // only be enabled if the handling of denormals is not important. -def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", - "true", - "Use NEON for single precision FP">; +def FeatureNEONForFP : SubtargetFeature<"neonfp", + "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; // On some processors, VLDn instructions that access unaligned data take one // extra cycle. Take that into account when computing operand latencies. @@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", // Some processors have a nonpipelined VFP coprocessor. def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", "NonpipelinedVFP", "true", - "VFP instructions are not pipelined">; + "VFP instructions are not pipelined">; // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. 
-def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", - "Disable VFP / NEON MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", - "HasVMLxForwarding", "true", - "Has multiplier accumulator forwarding">; + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", @@ -213,14 +244,16 @@ def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", "true", "Disable +1 predication cost for instructions updating CPSR">; -def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", - "AvoidMOVsShifterOperand", "true", - "Avoid movs instructions with shifter operand">; +def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", + "AvoidMOVsShifterOperand", "true", + "Avoid movs instructions with " + "shifter operand">; // Some processors perform return stack prediction. CodeGen should avoid issue // "normal" call instructions to callees which do not return. -def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", - "Has return address stack">; +def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", + "HasRetAddrStack", "true", + "Has return address stack">; // Some processors have no branch predictor, which changes the expected cost of // taking a branch which affects the choice of whether to use predicated @@ -230,63 +263,80 @@ def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", "Has no branch predictor">; /// DSP extension. 
-def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", - "Supports DSP instructions in ARM and/or Thumb2">; +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in " + "ARM and/or Thumb2">; // Multiprocessing extension. -def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", - "Supports Multiprocessing extension">; +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; // Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). def FeatureVirtualization : SubtargetFeature<"virtualization", - "HasVirtualization", "true", - "Supports Virtualization extension", - [FeatureHWDivThumb, FeatureHWDivARM]>; + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDivThumb, FeatureHWDivARM]>; -// M-series ISA -def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", - "Is microcontroller profile ('M' series)">; +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. 
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; -// R-series ISA -def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", - "Is realtime profile ('R' series)">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + +def FeatureExecuteOnly : SubtargetFeature<"execute-only", + "GenExecuteOnly", "true", + "Enable the generation of " + "execute only code.">; + +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable" + " as GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for " + "32-bit imms">; + +def FeatureNoNegativeImmediates + : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + + +//===----------------------------------------------------------------------===// +// ARM architecture class +// // A-series ISA def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", "Is application profile ('A' series)">; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. 
-def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - -def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", - "Disallow all unaligned memory " - "access">; +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; -def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", - "Generate calls via indirect call " - "instructions">; +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", + "Is microcontroller profile ('M' series)">; -def FeatureExecuteOnly - : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", - "Enable the generation of execute only code.">; -def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", - "Reserve R9, making it unavailable as " - "GPR">; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; -def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", - "Don't use movt/movw pairs for 32-bit " - "imms">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution", + [ModeThumb]>; -def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", - "NegativeImmediates", "false", - "Convert immediates and instructions " - "to their negated or complemented " - "equivalent when the immediate does " - "not fit in the encoding.">; //===----------------------------------------------------------------------===// // ARM ISAa. 
@@ -294,43 +344,57 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; + def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", "Support ARM v5T instructions", [HasV4TOps]>; + def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", - "Support ARM v5TE, v5TEj, and v5TExp instructions", + "Support ARM v5TE, v5TEj, and " + "v5TExp instructions", [HasV5TOps]>; + def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", "Support ARM v6 instructions", [HasV5TEOps]>; + def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; + def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true", "Support ARM v8M Baseline instructions", [HasV6MOps]>; + def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", "Support ARM v6k instructions", [HasV6Ops]>; + def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>; + def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", [HasV6T2Ops, FeaturePerfMon, FeatureV7Clrex]>; + +def HasV8MMainlineOps : + SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", + "Support ARM v8M Mainline instructions", + [HasV7Ops]>; + def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", [HasV7Ops, FeatureAcquireRelease]>; + def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; -def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps]>; -def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", - "Support ARM v8M Mainline instructions", - [HasV7Ops]>; 
//===----------------------------------------------------------------------===// @@ -386,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", "Cortex-M3 ARM processors", []>; + //===----------------------------------------------------------------------===// -// ARM schedules. +// ARM Helper classes. // -include "ARMSchedule.td" +class Architecture features> + : SubtargetFeature; + +class ProcNoItin Features> + : Processor; //===----------------------------------------------------------------------===// @@ -546,6 +616,12 @@ def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; +//===----------------------------------------------------------------------===// +// ARM schedules. +//===----------------------------------------------------------------------===// +// +include "ARMSchedule.td" + //===----------------------------------------------------------------------===// // ARM processors // @@ -553,6 +629,9 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; // Dummy CPU, used to target architectures def : ProcessorModel<"generic", CortexA8Model, []>; +// FIXME: Several processors below are not using their own scheduler +// model, but one of similar/previous processor. These should be fixed. 
+ def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; def : ProcNoItin<"strongarm", [ARMv4]>; @@ -612,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, FeatureVFP2, FeatureHasSlowFPVMLx]>; -// FIXME: A5 has currently the same Schedule model as A8 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureHasRetAddrStack, FeatureTrustZone, @@ -656,7 +734,6 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, FeatureCheckVLDnAlign, FeatureMP]>; -// FIXME: A12 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureHasRetAddrStack, FeatureTrustZone, @@ -666,7 +743,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureVirtualization, FeatureMP]>; -// FIXME: A15 has currently the same Schedule model as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureDontWidenVMOVS, FeatureHasRetAddrStack, @@ -678,7 +754,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: A17 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureHasRetAddrStack, FeatureTrustZone, @@ -688,9 +763,7 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: krait has currently the same Schedule model as A9 -// FIXME: krait has currently the same features as A9 plus VFP4 and hardware -// division features. +// FIXME: krait has currently the same features as A9 plus VFP4 and HWDiv def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureHasRetAddrStack, FeatureMuxedUnits, @@ -720,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureSlowVGETLNi32, FeatureSlowVDUP32]>; -// FIXME: R4 has currently the same ProcessorModel as A8. 
def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureAvoidPartialCPSR]>; -// FIXME: R4F has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, @@ -734,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureD16, FeatureAvoidPartialCPSR]>; -// FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, FeatureVFP3, @@ -744,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, FeatureVFP3, @@ -814,14 +883,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO, - FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, @@ -835,7 +904,6 @@ def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureCrypto, FeatureCRC]>; -// Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -881,9 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, //===----------------------------------------------------------------------===// include "ARMRegisterInfo.td" - include "ARMRegisterBanks.td" - include "ARMCallingConv.td" 
//===----------------------------------------------------------------------===// @@ -891,7 +957,6 @@ include "ARMCallingConv.td" //===----------------------------------------------------------------------===// include "ARMInstrInfo.td" - def ARMInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -912,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant { } def ARM : Target { - // Pull in Instruction Info: + // Pull in Instruction Info. let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; let AssemblyParserVariants = [ARMAsmParserVariant]; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e97a7ce5067f..370c0a7f5c53 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -117,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget(); if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && @@ -163,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // both or otherwise does not want to enable this optimization, the function // should return NULL if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return nullptr; return STI.isTargetDarwin() ? 
CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 384f80356cc8..bf00ef61c2d1 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -250,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return false; // Look to see if our OptionalDef is defining CPSR or CCR. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (MO.getReg() == ARM::CPSR) *CPSR = true; @@ -267,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { AFI->isThumb2Function()) return MI->isPredicable(); - for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.OpInfo[i].isPredicate()) + for (const MCOperandInfo &opInfo : MCID.operands()) + if (opInfo.isPredicate()) return true; return false; @@ -1972,7 +1971,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, break; } case CCValAssign::AExt: - // Intentional fall-through. Handle AExt and ZExt. + // Intentional fall-through. Handle AExt and ZExt. 
case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); @@ -2001,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, assert(VA.getLocVT() == MVT::f64 && "Custom lowering for v2f64 args not available"); + // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size() CCValAssign &NextVA = ArgLocs[++i]; assert(VA.isRegLoc() && NextVA.isRegLoc() && @@ -2172,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); AddOptionalDefs(MIB); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned R : RetRegs) + MIB.addReg(R, RegState::Implicit); return true; } @@ -2233,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { ArgRegs.reserve(I->getNumOperands()); ArgVTs.reserve(I->getNumOperands()); ArgFlags.reserve(I->getNumOperands()); - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { unsigned Arg = getRegForValue(Op); if (Arg == 0) return false; @@ -2278,8 +2277,8 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2423,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. 
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2932,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, bool Found = false; bool isZExt; - for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); - i != e; ++i) { - if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && - (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && - MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) { + if (FLE.Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FLE.ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) { Found = true; - isZExt = FoldableLoadExtends[i].isZExt; + isZExt = FLE.isZExt; } } if (!Found) return false; @@ -3057,9 +3055,8 @@ bool ARMFastISel::fastLowerArguments() { }; const TargetRegisterClass *RC = &ARM::rGPRRegClass; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - unsigned ArgNo = I->getArgNo(); + for (const Argument &Arg : F->args()) { + unsigned ArgNo = Arg.getArgNo(); unsigned SrcReg = GPRArgRegs[ArgNo]; unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. 
@@ -3069,7 +3066,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(&*I, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 7f9fe55a5c38..f75dd4de3f96 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2682,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDNode *ResNode; if (Subtarget->isThumb()) { - SDValue Pred = getAL(CurDAG, dl); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + SDValue Ops[] = { + CPIdx, + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops); } else { @@ -2698,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, Ops); } + // Annotate the Node with memory operand information so that MachineInstr + // queries work properly. This e.g. gives the register allocation the + // required information for rematerialization. 
+ MachineFunction& MF = CurDAG->getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, 4); + + cast(ResNode)->setMemRefs(MemOp, MemOp+1); + ReplaceNode(N, ResNode); return; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 29ef69ad0010..faed6b867e2b 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -722,6 +722,29 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { return false; break; } + case G_BRCOND: { + if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) { + DEBUG(dbgs() << "Unsupported condition register for G_BRCOND"); + return false; + } + + // Set the flags. + auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri)) + .addReg(I.getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI)) + return false; + + // Branch conditionally. 
+ auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc)) + .add(I.getOperand(1)) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index f23e62595d2e..1c17c07e4cb0 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -66,14 +66,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - // FIXME: Support s8 and s16 as well - for (unsigned Op : {G_SREM, G_UREM}) + for (unsigned Op : {G_SREM, G_UREM}) { + for (auto Ty : {s8, s16}) + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Lower); else if (AEABI(ST)) setAction({Op, s32}, Custom); else setAction({Op, s32}, Libcall); + } for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); @@ -88,6 +90,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_SELECT, p0}, Legal); setAction({G_SELECT, 1, s1}, Legal); + setAction({G_BRCOND, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); for (auto Ty : {s1, s8, s16}) setAction({G_CONSTANT, Ty}, WidenScalar); diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 13acea3c28a9..48b02d40b246 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -153,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, break; } - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; if (AP.lowerOperand(MO, MCOp)) { if (MCOp.isImm() && EncodeImms) { diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index c0c09e8c15af..844930235894 
100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -259,6 +259,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; case G_SELECT: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); (void)Ty2; assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); @@ -282,6 +283,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case G_FCMP: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty1 = MRI.getType(MI.getOperand(2).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(3).getReg()); (void)Ty2; @@ -329,6 +331,13 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &ARM::ValueMappings[ARM::DPR3OpsIdx]}); break; } + case G_BR: + OperandsMapping = getOperandsMapping({nullptr}); + break; + case G_BRCOND: + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); + break; default: return getInvalidInstructionMapping(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f41da3e8e223..22ce949367f3 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -47,6 +47,8 @@ public: ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. 
const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index cc7a7c3849bc..81b0aa7f8b98 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -515,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + bool isSelectOp = MI.getOpcode() == BPF::Select; - assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert"); + assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert"); // To "insert" a SELECT instruction, we actually have to insert the diamond // control-flow pattern. The incoming instruction knows the destination vreg @@ -548,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Insert Branch if Flag unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); int CC = MI.getOperand(3).getImm(); + int NewCC; switch (CC) { case ISD::SETGT: - BuildMI(BB, DL, TII.get(BPF::JSGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri; break; case ISD::SETUGT: - BuildMI(BB, DL, TII.get(BPF::JUGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri; break; case ISD::SETGE: - BuildMI(BB, DL, TII.get(BPF::JSGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri; break; case ISD::SETUGE: - BuildMI(BB, DL, TII.get(BPF::JUGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri; break; case ISD::SETEQ: - BuildMI(BB, DL, TII.get(BPF::JEQ_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? 
BPF::JEQ_rr : BPF::JEQ_ri; break; case ISD::SETNE: - BuildMI(BB, DL, TII.get(BPF::JNE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri; break; default: report_fatal_error("unimplemented select CondCode " + Twine(CC)); } + if (isSelectOp) + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addReg(MI.getOperand(2).getReg()) + .addMBB(Copy1MBB); + else + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addImm(MI.getOperand(2).getImm()) + .addMBB(Copy1MBB); // Copy0MBB: // %FalseValue = ... diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 5ad777268208..f68357809add 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -460,6 +460,11 @@ let usesCustomInserter = 1 in { "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2", [(set i64:$dst, (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>; + def Select_Ri : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2), + "# Select PSEUDO $dst = $lhs $imm $rhs ? 
$src : $src2", + [(set i64:$dst, + (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>; } // load 64-bit global addr into register diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index b064778c4bbd..d75d95a6baea 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexbit" - #include "HexagonBitTracker.h" #include "HexagonTargetMachine.h" #include "llvm/ADT/BitVector.h" @@ -42,6 +40,8 @@ #include #include +#define DEBUG_TYPE "hexbit" + using namespace llvm; static cl::opt PreserveTiedOps("hexbit-keep-tied", cl::Hidden, diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 2dc74632e9be..30ebf89c9808 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -45863,6 +45863,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000000; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dccleaninva : HInst< (outs), @@ -45872,6 +45873,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000010; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dcfetch : HInst< (outs), @@ -45900,6 +45902,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000001; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dczeroa : HInst< (outs), @@ -45909,6 +45912,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000110; let isSoloAin1 = 1; +let hasSideEffects = 1; let mayStore = 1; } def Y2_icinva : HInst< diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 03c4a83594b3..80361015e649 100644 --- 
a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -59,8 +59,6 @@ // J2_jump , %PC // Successors according to CFG: BB#6 BB#3 -#define DEBUG_TYPE "hexagon-eif" - #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" @@ -90,6 +88,8 @@ #include #include +#define DEBUG_TYPE "hexagon-eif" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 734f3c6658d9..a2f6dd68c1a1 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -86,8 +86,6 @@ // however, is that finding the locations where the implicit uses need // to be added, and updating the live ranges will be more involved. -#define DEBUG_TYPE "expand-condsets" - #include "HexagonInstrInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -116,6 +114,8 @@ #include #include +#define DEBUG_TYPE "expand-condsets" + using namespace llvm; static cl::opt OptTfrLimit("expand-condsets-tfr-limit", diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index c790579ccebc..e5e75198b2d1 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -8,8 +8,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-pei" - #include "HexagonFrameLowering.h" #include "HexagonBlockRanges.h" #include "HexagonInstrInfo.h" @@ -63,6 +61,8 @@ #include #include +#define DEBUG_TYPE "hexagon-pei" + // Hexagon stack frame layout as defined by the ABI: // // Incoming arguments diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index bf31e1699284..0a955aedaf1a 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -7,8 +7,6 @@ // 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexinsert" - #include "BitTracker.h" #include "HexagonBitTracker.h" #include "HexagonInstrInfo.h" @@ -44,6 +42,8 @@ #include #include +#define DEBUG_TYPE "hexinsert" + using namespace llvm; static cl::opt VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index 3470480d607d..2da211563e0a 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gen-pred" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +33,8 @@ #include #include +#define DEBUG_TYPE "gen-pred" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 67242764d453..3997702bc962 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1286,16 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } -// Creates a SPLAT instruction for a constant value VAL. -static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT, - SDValue Val) { - EVT T = VT.getVectorElementType(); - if (T == MVT::i8 || T == MVT::i16) - return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val); - - return SDValue(); -} - static bool isSExtFree(SDValue N) { // A sign-extend of a truncate of a sign-extend is free. if (N.getOpcode() == ISD::TRUNCATE && @@ -1374,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// Handle only specific vector loads. 
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = LoadNode->getChain(); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - SDValue Result; - SDValue Base = LoadNode->getBasePtr(); - ISD::LoadExtType Ext = LoadNode->getExtensionType(); - unsigned Alignment = LoadNode->getAlignment(); - SDValue LoadChain; - - if(Ext == ISD::NON_EXTLOAD) - Ext = ISD::ZEXTLOAD; - - if (VT == MVT::v4i16) { - if (Alignment == 2) { - SDValue Loads[4]; - // Base load. - Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base+2 load. - SDValue Increment = DAG.getConstant(2, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base and base+2. - SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32); - SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount); - SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]); - // Base + 4. - Increment = DAG.getConstant(4, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base + 6. - Increment = DAG.getConstant(6, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base+4 and base+6. 
- Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount); - SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]); - // Combine to i64. This could be optimised out later if we can - // affect reg allocation of this code. - Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2); - LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - Loads[0].getValue(1), Loads[1].getValue(1), - Loads[2].getValue(1), Loads[3].getValue(1)); - } else { - // Perform default type expansion. - Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), - LoadNode->getAlignment(), - LoadNode->getMemOperand()->getFlags()); - LoadChain = Result.getValue(1); - } - } else - llvm_unreachable("Custom lowering unsupported load"); - - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - // Since we pretend to lower a load, we need the original chain - // info attached to the result. - SDValue Ops[] = { Result, LoadChain }; - - return DAG.getMergeValues(Ops, DL); -} - SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1971,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Handling of vector operations. // - // Custom lower v4i16 load only. Let v4i16 store to be - // promoted for now. promoteLdStType(MVT::v4i8, MVT::i32); promoteLdStType(MVT::v2i16, MVT::i32); promoteLdStType(MVT::v8i8, MVT::i64); + promoteLdStType(MVT::v4i16, MVT::i64); promoteLdStType(MVT::v2i32, MVT::i64); - setOperationAction(ISD::LOAD, MVT::v4i16, Custom); - setOperationAction(ISD::STORE, MVT::v4i16, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64); - AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64); - // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. 
static const unsigned VectExpOps[] = { @@ -2301,7 +2212,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; - case HexagonISD::VPACK: return "HexagonISD::VPACK"; + case HexagonISD::VPACKE: return "HexagonISD::VPACKE"; + case HexagonISD::VPACKO: return "HexagonISD::VPACKO"; case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; @@ -2394,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization @@ -2409,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) } } if (IsScalarToVector) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); } - return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(Lane, dl, MVT::i32)); } if (UseHVX) { ArrayRef Mask = SVN->getMask(); size_t MaskLen = Mask.size(); - int ElemSizeInBits = VT.getScalarSizeInBits(); - if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) || - (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) { - // Return 1 for odd and 2 of even - StridedLoadKind Pattern = isStridedLoad(Mask); + unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen; + if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) || + 
(Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) { + StridedLoadKind Pattern = isStridedLoad(Mask); if (Pattern == StridedLoadKind::NoPattern) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32); - SDValue Ops[] = { Vec1, Vec0, StridePattern }; - return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops); + unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE + : HexagonISD::VPACKO; + return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)}); } // We used to assert in the "else" part here, but that is bad for Halide // Halide creates intermediate double registers by interleaving two @@ -2531,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Size > 64) return SDValue(); - APInt APSplatBits, APSplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; unsigned NElts = BVN->getNumOperands(); // Try to generate a SPLAT instruction. - if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) && - (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, - HasAnyUndefs, 0, true) && SplatBitSize <= 16)) { - unsigned SplatBits = APSplatBits.getZExtValue(); - int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >> - (32 - SplatBitSize)); - return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32)); + if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, false)) { + if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) { + unsigned ZV = APSplatBits.getZExtValue(); + assert(SplatBitSize <= 32 && "Can only handle up to i32"); + // Sign-extend the splat value from SplatBitSize to 32. + int32_t SV = SplatBitSize < 32 + ? 
int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize) + : int32_t(ZV); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(SV, dl, MVT::i32)); + } + } } // Try to generate COMBINE to build v2i32 vectors. @@ -2974,8 +2891,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - // Custom lower some vector loads. - case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index bfd2c94eeaba..d66cbc95e918 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -62,7 +62,8 @@ namespace HexagonISD { EXTRACTU, EXTRACTURP, VCOMBINE, - VPACK, + VPACKE, + VPACKO, TC_RETURN, EH_RETURN, DCFETCH, @@ -164,7 +165,6 @@ namespace HexagonISD { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index c611857ec26a..104a28654dd5 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -1366,6 +1366,18 @@ defm : MaskedStore ; defm : MaskedStore ; defm : MaskedStore ; +//******************************************************************* +// SYSTEM +//******************************************************************* + +def: T_R_pat; +def: T_R_pat; +def: T_R_pat; +def: 
T_R_pat; + +def: T_RR_pat; +def: T_RP_pat; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index a331c978f59d..374ffa3799b0 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -10,8 +10,6 @@ // load/store instructions. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "opt-addr-mode" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" @@ -36,6 +34,8 @@ #include #include +#define DEBUG_TYPE "opt-addr-mode" + static cl::opt CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ba98b8994937..804a547d5b33 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2250,6 +2250,12 @@ def: Storea_pat, I32, addrgp, PS_storerhabs>; def: Storea_pat, I32, addrgp, PS_storeriabs>; def: Storea_pat, I64, addrgp, PS_storerdabs>; +// Prefer this pattern to S2_asl_i_p_or for the special case of joining +// two 32-bit words into a 64-bit word. 
+let AddedComplexity = 200 in +def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)), + (A2_combinew I32:$a, I32:$b)>; + def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)), (i64 (zext (i32 (and I32:$a, (i32 65535)))))), (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))), @@ -2971,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, Requires<[UseHVXDbl]>; -def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>, - SDTCisInt<3>]>; - -def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>; - -// 0 as the last argument denotes vpacke. 1 denotes vpacko -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 0))), - (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 1))), - (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 0))), - (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 1))), - (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; - -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 1))), - (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 1))), - (V6_vpackoh_128B VecDblRegs:$Vs, 
VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; +def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>; + +def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>; +def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>; + +let Predicates = [UseHVXSgl] in { + def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>; +} + +let Predicates = [UseHVXDbl] in { + def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; +} def V2I1: PatLeaf<(v2i1 PredRegs:$R)>; def V4I1: PatLeaf<(v4i1 PredRegs:$R)>; @@ -3073,6 +3074,10 @@ def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // four halfwords of 64-bits destination register. 
def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)), + (A2_combineii imm:$s8, imm:$s8)>; +def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>; + class VArith_pat : Pat <(Op Type:$Rss, Type:$Rtt), @@ -3099,14 +3104,11 @@ def: VArith_pat ; def: VArith_pat ; def: VArith_pat ; -def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_lsr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asl_i_vw V2I32:$b, imm:$c)>; def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 34df2ebcc520..ea86c9c42f47 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -53,6 +53,10 @@ static cl::opt EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), cl::desc("Emit hexagon jump tables in function section")); +static cl::opt + EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon lookup tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -136,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal( << (Kind.isBSS() ? "kind_bss " : "" ) << (Kind.isBSSLocal() ? 
"kind_bss_local " : "" )); + // If the lookup table is used by more than one function, do not place + // it in text section. + if (EmitLutInText && GO->getName().startswith("switch.table")) { + if (const Function *Fn = getLutUsedFunction(GO)) + return selectSectionForLookupTable(GO, TM, Fn); + } + if (isGlobalInSmallSection(GO, TM)) return selectSmallSectionForGlobal(GO, Kind, TM); @@ -402,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +// Return the function that uses the lookup table. If there are more +// than one live function that uses this look table, bail out and place +// the lookup table in default section. +const Function * +HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const { + const Function *ReturnFn = nullptr; + for (auto U : GO->users()) { + // validate each instance of user to be a live function. + auto *I = dyn_cast(U); + if (!I) + continue; + auto *Bb = I->getParent(); + if (!Bb) + continue; + auto *UserFn = Bb->getParent(); + if (!ReturnFn) + ReturnFn = UserFn; + else if (ReturnFn != UserFn) + return nullptr; + } + return ReturnFn; +} + +MCSection *HexagonTargetObjectFile::selectSectionForLookupTable( + const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const { + + SectionKind Kind = SectionKind::getText(); + // If the function has explicit section, place the lookup table in this + // explicit section. 
+ if (Fn->hasSection()) + return getExplicitSectionGlobal(Fn, Kind, TM); + + const auto *FuncObj = dyn_cast(Fn); + return SelectSectionForGlobal(FuncObj, Kind, TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 373d850b53be..eff44f097e03 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -36,6 +36,8 @@ namespace llvm { bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, const Function &F) const override; + const Function *getLutUsedFunction(const GlobalObject *GO) const; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; @@ -46,6 +48,10 @@ namespace llvm { MCSection *selectSmallSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const; + + MCSection *selectSectionForLookupTable(const GlobalObject *GO, + const TargetMachine &TM, + const Function *Fn) const; }; } // namespace llvm diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b72c9d534478..e12188e70602 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,9 +304,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); - bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -2514,16 +2511,6 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? 
MER_Fail : MER_Success; - case Mips::MFTC0: case Mips::MTTC0: - case Mips::MFTGPR: case Mips::MTTGPR: - case Mips::MFTLO: case Mips::MTTLO: - case Mips::MFTHI: case Mips::MTTHI: - case Mips::MFTACX: case Mips::MTTACX: - case Mips::MFTDSP: case Mips::MTTDSP: - case Mips::MFTC1: case Mips::MTTC1: - case Mips::MFTHC1: case Mips::MTTHC1: - case Mips::CFTC1: case Mips::CTTC1: - return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4895,212 +4882,6 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -// Map the DSP accumulator and control register to the corresponding gpr -// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions -// do not map the DSP registers contigously to gpr registers. -static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { - switch (Inst.getOpcode()) { - case Mips::MFTLO: - case Mips::MTTLO: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::ZERO; - case Mips::AC1: - return Mips::A0; - case Mips::AC2: - return Mips::T0; - case Mips::AC3: - return Mips::T4; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTHI: - case Mips::MTTHI: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::AT; - case Mips::AC1: - return Mips::A1; - case Mips::AC2: - return Mips::T1; - case Mips::AC3: - return Mips::T5; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTACX: - case Mips::MTTACX: - switch (Inst.getOperand(IsMFDSP ? 
1 : 0).getReg()) { - case Mips::AC0: - return Mips::V0; - case Mips::AC1: - return Mips::A2; - case Mips::AC2: - return Mips::T2; - case Mips::AC3: - return Mips::T6; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTDSP: - case Mips::MTTDSP: - return Mips::S0; - default: - llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); - } -} - -// Map the floating point register operand to the corresponding register -// operand. -static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { - switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { - case Mips::F0: return Mips::ZERO; - case Mips::F1: return Mips::AT; - case Mips::F2: return Mips::V0; - case Mips::F3: return Mips::V1; - case Mips::F4: return Mips::A0; - case Mips::F5: return Mips::A1; - case Mips::F6: return Mips::A2; - case Mips::F7: return Mips::A3; - case Mips::F8: return Mips::T0; - case Mips::F9: return Mips::T1; - case Mips::F10: return Mips::T2; - case Mips::F11: return Mips::T3; - case Mips::F12: return Mips::T4; - case Mips::F13: return Mips::T5; - case Mips::F14: return Mips::T6; - case Mips::F15: return Mips::T7; - case Mips::F16: return Mips::S0; - case Mips::F17: return Mips::S1; - case Mips::F18: return Mips::S2; - case Mips::F19: return Mips::S3; - case Mips::F20: return Mips::S4; - case Mips::F21: return Mips::S5; - case Mips::F22: return Mips::S6; - case Mips::F23: return Mips::S7; - case Mips::F24: return Mips::T8; - case Mips::F25: return Mips::T9; - case Mips::F26: return Mips::K0; - case Mips::F27: return Mips::K1; - case Mips::F28: return Mips::GP; - case Mips::F29: return Mips::SP; - case Mips::F30: return Mips::FP; - case Mips::F31: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc1 alias!"); - } -} - -// Map the coprocessor operand the corresponding gpr register operand. -static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { - switch (Inst.getOperand(IsMFTC0 ? 
1 : 0).getReg()) { - case Mips::COP00: return Mips::ZERO; - case Mips::COP01: return Mips::AT; - case Mips::COP02: return Mips::V0; - case Mips::COP03: return Mips::V1; - case Mips::COP04: return Mips::A0; - case Mips::COP05: return Mips::A1; - case Mips::COP06: return Mips::A2; - case Mips::COP07: return Mips::A3; - case Mips::COP08: return Mips::T0; - case Mips::COP09: return Mips::T1; - case Mips::COP010: return Mips::T2; - case Mips::COP011: return Mips::T3; - case Mips::COP012: return Mips::T4; - case Mips::COP013: return Mips::T5; - case Mips::COP014: return Mips::T6; - case Mips::COP015: return Mips::T7; - case Mips::COP016: return Mips::S0; - case Mips::COP017: return Mips::S1; - case Mips::COP018: return Mips::S2; - case Mips::COP019: return Mips::S3; - case Mips::COP020: return Mips::S4; - case Mips::COP021: return Mips::S5; - case Mips::COP022: return Mips::S6; - case Mips::COP023: return Mips::S7; - case Mips::COP024: return Mips::T8; - case Mips::COP025: return Mips::T9; - case Mips::COP026: return Mips::K0; - case Mips::COP027: return Mips::K1; - case Mips::COP028: return Mips::GP; - case Mips::COP029: return Mips::SP; - case Mips::COP030: return Mips::FP; - case Mips::COP031: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc0 alias!"); - } -} - -/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing -/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. 
-bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI) { - MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned rd = 0; - unsigned u = 1; - unsigned sel = 0; - unsigned h = 0; - bool IsMFTR = false; - switch (Inst.getOpcode()) { - case Mips::MFTC0: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTC0: - u = 0; - rd = getRegisterForMxtrC0(Inst, IsMFTR); - sel = Inst.getOperand(2).getImm(); - break; - case Mips::MFTGPR: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTGPR: - rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); - break; - case Mips::MFTLO: - case Mips::MFTHI: - case Mips::MFTACX: - case Mips::MFTDSP: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTLO: - case Mips::MTTHI: - case Mips::MTTACX: - case Mips::MTTDSP: - rd = getRegisterForMxtrDSP(Inst, IsMFTR); - sel = 1; - break; - case Mips::MFTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MFTC1: - IsMFTR = true; - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::MTTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::CFTC1: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::CTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 3; - break; - } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd - : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() - : Inst.getOperand(0).getReg()); - - TOut.emitRRIII(IsMFTR ? 
Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, - STI); - return false; -} - unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 7caeb08589af..2907b7715857 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -193,21 +193,6 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } -void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, - unsigned Reg1, int16_t Imm0, int16_t Imm1, - int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(MCOperand::createReg(Reg0)); - TmpInst.addOperand(MCOperand::createReg(Reg1)); - TmpInst.addOperand(MCOperand::createImm(Imm0)); - TmpInst.addOperand(MCOperand::createImm(Imm1)); - TmpInst.addOperand(MCOperand::createImm(Imm2)); - TmpInst.setLoc(IDLoc); - getStreamer().EmitInstruction(TmpInst, *STI); -} - void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index d2f0fdcc6cc1..6ceb05577538 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -190,6 +190,9 @@ def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", + "Disable use of the jal instruction">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a6ec9fb2e598..20319f85696c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + } + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -923,14 +936,127 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } +static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, + const MipsSubtarget &Subtarget) { + // ROOTNode must have a multiplication as an operand for the match to be + // successful. + if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL && + ROOTNode->getOperand(1).getOpcode() != ISD::MUL) + return SDValue(); + + // We don't handle vector types here. 
+ if (ROOTNode->getValueType(0).isVector()) + return SDValue(); + + // For MIPS64, madd / msub instructions are inefficent to use with 64 bit + // arithmetic. E.g. + // (add (mul a b) c) => + // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in + // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32) + // or + // MIPS64R2: (dins (mflo res) (mfhi res) 32 32) + // + // The overhead of setting up the Hi/Lo registers and reassembling the + // result makes this a dubious optimzation for MIPS64. The core of the + // problem is that Hi/Lo contain the upper and lower 32 bits of the + // operand and result. + // + // It requires a chain of 4 add/mul for MIPS64R2 to get better code + // density than doing it naively, 5 for MIPS64. Additionally, using + // madd/msub on MIPS64 requires the operands actually be 32 bit sign + // extended operands, not true 64 bit values. + // + // FIXME: For the moment, disable this completely for MIPS64. + if (Subtarget.hasMips64()) + return SDValue(); + + SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(0) + : ROOTNode->getOperand(1); + + SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(1) + : ROOTNode->getOperand(0); + + // Transform this to a MADD only if the user of this node is the add. + // If there are other users of the mul, this function returns here. + if (!Mult.hasOneUse()) + return SDValue(); + + // maddu and madd are unusual instructions in that on MIPS64 bits 63..31 + // must be in canonical form, i.e. sign extended. For MIPS32, the operands + // of the multiply must have 32 or more sign bits, otherwise we cannot + // perform this optimization. We have to check this here as we're performing + // this optimization pre-legalization. 
+ SDValue MultLHS = Mult->getOperand(0); + SDValue MultRHS = Mult->getOperand(1); + + bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND && + MultRHS->getOpcode() == ISD::SIGN_EXTEND; + bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND && + MultRHS->getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Initialize accumulator. + SDLoc DL(ROOTNode); + SDValue TopHalf; + SDValue BottomHalf; + BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(0, DL)); + + TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(1, DL)); + SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + BottomHalf, + TopHalf); + + // Create MipsMAdd(u) / MipsMSub(u) node. + bool IsAdd = ROOTNode->getOpcode() == ISD::ADD; + unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd) + : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub); + SDValue MAddOps[3] = { + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; + EVT VTs[2] = {MVT::i32, MVT::i32}; + SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); + + SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + SDValue Combined = + CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); + return Combined; +} + +static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); + + return SDValue(); + } + + return SDValue(); +} + static SDValue performADDCombine(SDNode *N, 
SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + // (add v0 (mul v1, v2)) => (madd v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); - if (DCI.isBeforeLegalizeOps()) return SDValue(); + } + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1058,6 +1184,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); + case ISD::SUB: + return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -3021,6 +3149,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT Ty = Callee.getValueType(); bool GlobalOrExternal = false, IsCallReloc = false; + // The long-calls feature is ignored in case of PIC. + // While we do not support -mshared / -mno-shared properly, + // ignore long-calls in case of -mabicalls too. + if (Subtarget.useLongCalls() && !Subtarget.isABICalls() && !IsPIC) { + // Get the address of the callee into a register to prevent + // using of the `jal` instruction for the direct call. + if (auto *N = dyn_cast(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + else if (auto *N = dyn_cast(Callee)) + Callee = Subtarget.hasSym32() ? 
getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + } + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { if (IsPIC) { const GlobalValue *Val = G->getGlobal(); diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 94f3a74be98b..0333fe6520fa 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in { } def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; +def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; +def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} + let AdditionalPredicates = [NotInMicroMips] in { def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2, FGR_32; diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index edc0981e6278..64bee5bfba18 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -35,8 +35,6 @@ class FIELD5 Val> { def FIELD5_1_DMT_EMT : FIELD5<0b00001>; def FIELD5_2_DMT_EMT : FIELD5<0b01111>; def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; -def FIELD5_MFTR : FIELD5<0b01000>; -def FIELD5_MTTR : FIELD5<0b01100>; class COP0_MFMC0_MT : MipsMTInst { bits<32> Inst; @@ -52,25 +50,6 @@ class COP0_MFMC0_MT : MipsMTInst { let Inst{2-0} = 0b001; } -class COP0_MFTTR_MT : MipsMTInst { - bits<32> Inst; - - bits<5> rt; - bits<5> rd; - bits<1> u; - bits<1> h; - bits<3> sel; - let Inst{31-26} = 0b010000; // COP0 - let Inst{25-21} = Op.Value; // MFMC0 - let Inst{20-16} = rt; - let Inst{15-11} = rd; - let Inst{10-6} = 0b00000; // rx - currently unsupported. 
- let Inst{5} = u; - let Inst{4} = h; - let Inst{3} = 0b0; - let Inst{2-0} = sel; -} - class SPECIAL3_MT_FORK : MipsMTInst { bits<32> Inst; diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td index 72e626cbec40..ab6693f60fd9 100644 --- a/lib/Target/Mips/MipsMTInstrInfo.td +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -6,13 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the MIPS MT ASE as defined by MD00378 1.12. -// -// TODO: Add support for the microMIPS encodings for the MT ASE and add the -// instruction mappings. -// -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // MIPS MT Instruction Encodings @@ -34,10 +27,6 @@ class FORK_ENC : SPECIAL3_MT_FORK; class YIELD_ENC : SPECIAL3_MT_YIELD; -class MFTR_ENC : COP0_MFTTR_MT; - -class MTTR_ENC : COP0_MFTTR_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Descriptions //===----------------------------------------------------------------------===// @@ -50,22 +39,6 @@ class MT_1R_DESC_BASE { InstrItinClass Itinerary = Itin; } -class MFTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MFTR; -} - -class MTTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MTTR; -} - class FORK_DESC { dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); dag InOperandList = (ins GPR32Opnd:$rt); @@ -106,73 +79,8 @@ let hasSideEffects = 1, isNotDuplicable = 1, 
def FORK : FORK_ENC, FORK_DESC, ASE_MT; def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; - - def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; - - def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; } -//===----------------------------------------------------------------------===// -// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. -//===----------------------------------------------------------------------===// -def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, - uimm3:$sel), - "mftc0 $rd, $rt, $sel">, ASE_MT; - -def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mftgpr $rd, $rt">, ASE_MT; - -def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftlo $rt, $ac">, ASE_MT; - -def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mfthi $rt, $ac">, ASE_MT; - -def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftacx $rt, $ac">, ASE_MT; - -def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), - "mftdsp $rt">, ASE_MT; - -def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mftc1 $rt, $ft">, ASE_MT; - -def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mfthc1 $rt, $ft">, ASE_MT; - -def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), - "cftc1 $rt, $ft">, ASE_MT; - - -def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mttc0 $rt, $rd, $sel">, ASE_MT; - -def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), - "mttgpr $rd, $rt">, ASE_MT; - -def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttlo $rt, $ac">, ASE_MT; - -def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mtthi $rt, $ac">, ASE_MT; - -def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttacx $rt, $ac">, ASE_MT; - -def MTTDSP : MipsAsmPseudoInst<(outs), (ins 
GPR32Opnd:$rt), - "mttdsp $rt">, ASE_MT; - -def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mttc1 $rt, $ft">, ASE_MT; - -def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mtthc1 $rt, $ft">, ASE_MT; - -def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), - "cttc1 $rt, $ft">, ASE_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Definitions //===----------------------------------------------------------------------===// @@ -187,22 +95,4 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; - - def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 49ae6dd4cd39..4be26dd25dc0 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, - SDValue CmpLHS, const SDLoc &DL, - SDNode *Node) const { - unsigned Opc = InFlag.getOpcode(); (void)Opc; - - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == 
ISD::SUBE)) && - "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); - - unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; - if (Subtarget->isGP64bit()) { - SLTuOp = Mips::SLTu64; - ADDuOp = Mips::DADDu; - } - - SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; +void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = InFlag.getOpcode(); SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); - - if (Subtarget->isGP64bit()) { - // On 64-bit targets, sltu produces an i64 but our backend currently says - // that SLTu64 produces an i32. We need to fix this in the long run but for - // now, just make the DAG type-correct by asserting the upper bits are zero. - Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, - CurDAG->getTargetConstant(0, DL, VT), - SDValue(Carry, 0), - CurDAG->getTargetConstant(Mips::sub_32, DL, - VT)); + // In the base case, we can rely on the carry bit from the addsc + // instruction. + if (Opc == ISD::ADDC) { + SDValue Ops[3] = {LHS, RHS, InFlag}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); + return; } - // Generate a second addition only if we know that RHS is not a - // constant-zero node. - SDNode *AddCarry = Carry; - ConstantSDNode *C = dyn_cast(RHS); - if (!C || C->getZExtValue()) - AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); + assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!"); + + // The more complex case is when there is a chain of ISD::ADDE nodes like: + // (adde (adde (adde (addc a b) c) d) e). + // + // The addwc instruction does not write to the carry bit, instead it writes + // to bit 20 of the dsp control register. To match this series of nodes, each + // intermediate adde node must be expanded to write the carry bit before the + // addition. 
+ + // Start by reading the overflow field for addsc and moving the value to the + // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP + // corresponds to reading/writing the entire control register to/from a GPR. + + SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32); + + SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32); + + SDNode *DSPCtrlField = + CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag); + + SDNode *Carry = CurDAG->getMachineNode( + Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne); - CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); + SDValue Ops[4] = {SDValue(DSPCtrlField, 0), + CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne, + SDValue(Carry, 0)}; + SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops); + + // My reading of the the MIPS DSP 3.01 specification isn't as clear as I + // would like about whether bit 20 always gets overwritten by addwc. + // Hence take an extremely conservative view and presume it's sticky. We + // therefore need to clear it. + + SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32); + + SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)}; + SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps); + + SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue, + SDValue(DSPCtrlFinal, 0), CstOne); + + SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands); } /// Match frameindex @@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { switch(Opcode) { default: break; - case ISD::SUBE: { - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ? 
Mips::DSUBu : Mips::SUBu; - selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); - return true; - } - case ISD::ADDE: { - if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. - break; - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu; - selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); + selectAddE(Node, DL); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index f89a350cab04..6f38289c5a45 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,8 +41,7 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, - const SDLoc &DL, SDNode *Node) const; + void selectAddE(SDNode *Node, const SDLoc &DL) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 06a97b9d123e..72b2738bfac4 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setTargetDAGCombine(ISD::ADDE); - setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } -// selectMADD - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc multLo, Lo0), (adde multHi, Hi0), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi 
register -// Return true if pattern matching was successful. -static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { - // ADDENode's second operand must be a flag output of an ADDC node in order - // for the matching to be successful. - SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); - - if (ADDCNode->getOpcode() != ISD::ADDC) - return false; - - SDValue MultHi = ADDENode->getOperand(0); - SDValue MultLo = ADDCNode->getOperand(0); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MADD only if ADDENode and ADDCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than ADDENode or ADDCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MADD instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(ADDENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - ADDCNode->getOperand(1), - ADDENode->getOperand(1)); - - // create MipsMAdd(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MAddu : MipsISD::MAdd; - - SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of adde and addc here - if (!SDValue(ADDCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); - } - if (!SDValue(ADDENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); - } - - return true; -} - -// selectMSUB - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc Lo0, multLo), (sube Hi0, multHi), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { - // SUBENode's second operand must be a flag output of an SUBC node in order - // for the matching to be successful. - SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); - - if (SUBCNode->getOpcode() != ISD::SUBC) - return false; - - SDValue MultHi = SUBENode->getOperand(1); - SDValue MultLo = SUBCNode->getOperand(1); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MSUB only if SUBENode and SUBCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. 
- // If there exist users other than SUBENode or SUBCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MSUB instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(SUBENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - SUBCNode->getOperand(0), - SUBENode->getOperand(0)); - - // create MipsSub(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub; - - SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of sube and subc here - if (!SDValue(SUBCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); - } - if (!SDValue(SUBENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); - } - - return true; -} - -static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && - 
selectMSUB(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { - case ISD::ADDE: - return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; - case ISD::SUBE: - return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: @@ -3596,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(RC); + unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) + .addImm(0) + .addReg(Rs) + .addImm(Mips::sub_32); + Rs = Tmp; + } BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::SH : Mips::SH64)) .addReg(Rs) .addReg(Rt) @@ -3649,6 +3481,12 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, for (unsigned i = 1; i < MI.getNumOperands(); i++) MIB.add(MI.getOperand(i)); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); + Rt = Tmp; + } + BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt); MI.eraseFromParent(); @@ -3716,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, assert(Subtarget.hasMSA() && Subtarget.hasMips32r2()); bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64; + bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -3726,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1; + unsigned MFC1Opc = IsFGR64onMips64 + ? Mips::DMFC1 + : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1); unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. @@ -3735,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; - if (!Subtarget.hasMips64() && IsFGR64) { + if (IsFGR64onMips32) { unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); @@ -3829,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? 
&Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1; + unsigned MTC1Opc = IsFGR64onMips64 + ? Mips::DMTC1 + : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W; unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 8ec55ab6284d..c2947bb44ef5 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -226,7 +226,6 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo -def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -256,7 +255,6 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo -def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -666,14 +664,12 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7619e7b08612..cce3b8c4c8d1 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Disable use of the `jal` instruction. 
+ bool UseLongCalls = false; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -269,6 +272,8 @@ public: bool useSoftFloat() const { return IsSoftFloat; } + bool useLongCalls() const { return UseLongCalls; } + bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index af24838665e1..7d9f99ce071e 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -119,9 +119,6 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, - int16_t Imm1, int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a00b56af0490..92c8c224b71b 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -271,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3aaf7ef2c2a0..901539b682ba 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ 
b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -178,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -211,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } // Select an address into a single register. @@ -305,6 +309,7 @@ private: bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; @@ -2999,6 +3004,25 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val: +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast(N); + StoreSDNode *STN = dyn_cast(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. 
+ return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0e069ec1665f..b3a3c73f6df0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2130,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. 
@@ -2145,7 +2145,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2169,7 +2169,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2196,7 +2196,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2206,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2321,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. 
if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 821927d3b157..49d7d8220af1 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -616,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 13b4f9ab962d..e74ba38c351f 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1635,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons. 
- for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), - IE = MRI->use_instr_end(); I != IE; ++I) { + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { unsigned Pred = UseMI->getOperand(0).getImm(); @@ -1658,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL; ++I) { bool FoundUse = false; - for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), - JE = MRI->use_instr_end(); J != JE; ++J) + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end(); + J != JE; ++J) if (&*J == &*I) { FoundUse = true; break; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 6d9f55206b6a..dd7fc2659102 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -405,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ return cast(N)->getAlignment() < 4; }]>; +// This is a somewhat weaker condition than actually checking for 16-byte +// alignment. It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form +// instructions). 
+def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -815,7 +834,8 @@ def pred : Operand { def iaddr : ComplexPattern; def xaddr : ComplexPattern; def xoaddr : ComplexPattern; -def ixaddr : ComplexPattern; // "std" +def ixaddr : ComplexPattern; // "std" +def iqaddr : ComplexPattern; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43635a8919e2..942e8b392b82 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -2606,37 +2606,41 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; - - def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : 
Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; + def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad 
xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2788,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; 
+ def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2881,13 +2885,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); @@ -2911,13 +2915,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3088,17 +3092,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 
(scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 8af7f7e98117..9207165c46a6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -754,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -852,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -883,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. 
assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1076,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 2dc3828334ac..be705507b534 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,8 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 11004c5a952f..91cab00b2b65 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -20,6 +20,10 @@ include "llvm/Target/Target.td" // SPARC Subtarget features. 
// +def FeatureSoftMulDiv + : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true", + "Use software emulation for integer multiply and divide">; + def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -75,7 +79,7 @@ class Proc Features> : Processor; def : Proc<"generic", []>; -def : Proc<"v7", []>; +def : Proc<"v7", [FeatureSoftMulDiv]>; def : Proc<"v8", []>; def : Proc<"supersparc", []>; def : Proc<"sparclite", []>; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 9e7e3c6b705a..6767a59a9757 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1689,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::MUL, MVT::i32, Expand); + if (Subtarget->useSoftMulDiv()) { + // .umul works for both signed and unsigned + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setLibcallName(RTLIB::MUL_I32, ".umul"); + + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SDIV_I32, ".div"); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setLibcallName(RTLIB::UDIV_I32, ".udiv"); + } + if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index ae45c8be6752..3194ad4aeb6b 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">; // True when generating 64-bit code. This also implies HasV9. 
def Is64Bit : Predicate<"Subtarget->is64Bit()">; +def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">, + AssemblerPredicate<"FeatureSoftMulDiv">; + // HasV9 - This predicate is true when the target processor supports V9 // instructions. Note that the machine may be running in 32-bit mode. def HasV9 : Predicate<"Subtarget->isV9()">, diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 43ddef3cc96e..daac56add87c 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + UseSoftMulDiv = false; IsV9 = false; IsLeon = false; V8DeprecatedInsts = false; diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index fa42da425ff2..d18139984b87 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -32,6 +32,7 @@ class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { Triple TargetTriple; virtual void anchor(); + bool UseSoftMulDiv; bool IsV9; bool IsLeon; bool V8DeprecatedInsts; @@ -76,6 +77,7 @@ public: bool enableMachineScheduler() const override; + bool useSoftMulDiv() const { return UseSoftMulDiv; } bool isV9() const { return IsV9; } bool isLeon() const { return IsLeon; } bool isVIS() const { return IsVIS; } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ee23692ad1db..33680789ee08 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -275,6 +275,10 @@ public: SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override; + /// getLocRange - Get the range between the first and last token of this + /// operand. 
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + // Used by the TableGen code to add particular types of operand // to an instruction. void addRegOperands(MCInst &Inst, unsigned N) const { @@ -1164,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } +std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS); + bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1209,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = SystemZMnemonicSpellCheck( + ((SystemZOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((SystemZOperand &)*Operands[0]).getLocRange()); + } } llvm_unreachable("Unexpected match type"); diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt index 6f8431db7b11..9b8b141fd52a 100644 --- a/lib/Target/SystemZ/LLVMBuild.txt +++ b/lib/Target/SystemZ/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = SystemZCodeGen parent = SystemZ -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target +required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index c5faa0d62881..fda9c30fe3fc 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -187,6 +187,58 @@ def Arch11NewFeatures : SystemZFeatureList<[ FeatureVector ]>; 
+//===----------------------------------------------------------------------===// +// +// New features added in the Twelvth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions2 : SystemZFeature< + "miscellaneous-extensions-2", "MiscellaneousExtensions2", + "Assume that the miscellaneous-extensions facility 2 is installed" +>; + +def FeatureGuardedStorage : SystemZFeature< + "guarded-storage", "GuardedStorage", + "Assume that the guarded-storage facility is installed" +>; + +def FeatureMessageSecurityAssist7 : SystemZFeature< + "message-security-assist-extension7", "MessageSecurityAssist7", + "Assume that the message-security-assist extension facility 7 is installed" +>; + +def FeatureMessageSecurityAssist8 : SystemZFeature< + "message-security-assist-extension8", "MessageSecurityAssist8", + "Assume that the message-security-assist extension facility 8 is installed" +>; + +def FeatureVectorEnhancements1 : SystemZFeature< + "vector-enhancements-1", "VectorEnhancements1", + "Assume that the vector enhancements facility 1 is installed" +>; +def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">; + +def FeatureVectorPackedDecimal : SystemZFeature< + "vector-packed-decimal", "VectorPackedDecimal", + "Assume that the vector packed decimal facility is installed" +>; + +def FeatureInsertReferenceBitsMultiple : SystemZFeature< + "insert-reference-bits-multiple", "InsertReferenceBitsMultiple", + "Assume that the insert-reference-bits-multiple facility is installed" +>; + +def Arch12NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions2, + FeatureGuardedStorage, + FeatureMessageSecurityAssist7, + FeatureMessageSecurityAssist8, + FeatureVectorEnhancements1, + FeatureVectorPackedDecimal, + FeatureInsertReferenceBitsMultiple +]>; + //===----------------------------------------------------------------------===// // // Cumulative supported and 
unsupported feature sets @@ -201,9 +253,13 @@ def Arch10SupportedFeatures : SystemZFeatureAdd; def Arch11SupportedFeatures : SystemZFeatureAdd; +def Arch12SupportedFeatures + : SystemZFeatureAdd; -def Arch11UnsupportedFeatures +def Arch12UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch11UnsupportedFeatures + : SystemZFeatureAdd; def Arch10UnsupportedFeatures : SystemZFeatureAdd; def Arch9UnsupportedFeatures diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2801141cd951..2d916d2e1521 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -101,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } - addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (Subtarget.hasVectorEnhancements1()) + addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); + else + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); if (Subtarget.hasVector()) { addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); @@ -316,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::CTPOP, VT, Legal); + else + setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); @@ -414,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::v2f64, Legal); } + // The vector enhancements facility 1 has instructions for these. 
+ if (Subtarget.hasVectorEnhancements1()) { + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FABS, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f128, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f128, Legal); + setOperationAction(ISD::FMINNUM, MVT::f128, Legal); + setOperationAction(ISD::FMINNAN, MVT::f128, Legal); + } + // We have fused multiply-addition for f32 and f64 but not f128. 
setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); - setOperationAction(ISD::FMA, MVT::f128, Expand); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FMA, MVT::f128, Legal); + else + setOperationAction(ISD::FMA, MVT::f128, Expand); + + // We don't have a copysign instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); // Needed so that we don't try to implement f128 constant loads using // a load-and-extend of a f80 constant (in cases where the constant @@ -425,6 +481,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + // We don't have extending load instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + } + // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); @@ -489,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f64: return true; case MVT::f128: - return false; + return Subtarget.hasVectorEnhancements1(); default: break; } @@ -1462,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { return true; case Intrinsic::s390_vfcedbs: + case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchdbs: + case Intrinsic::s390_vfchsbs: Opcode = SystemZISD::VFCMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchedbs: + case Intrinsic::s390_vfchesbs: Opcode = SystemZISD::VFCMPHES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vftcidb: + case Intrinsic::s390_vftcisb: Opcode = SystemZISD::VFTCI; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL, // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, // producing a result of type VT. -static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, - EVT VT, SDValue CmpOp0, SDValue CmpOp1) { - // There is no hardware support for v4f32, so extend the vector into - // two v2f64s and compare those. - if (CmpOp0.getValueType() == MVT::v4f32) { +SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, + SDValue CmpOp1) const { + // There is no hardware support for v4f32 (unless we have the vector + // enhancements facility 1), so extend the vector into two v2f64s + // and compare those. 
+ if (CmpOp0.getValueType() == MVT::v4f32 && + !Subtarget.hasVectorEnhancements1()) { SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); @@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. -static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - ISD::CondCode CC, SDValue CmpOp0, - SDValue CmpOp1) { +SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, + const SDLoc &DL, EVT VT, + ISD::CondCode CC, + SDValue CmpOp0, + SDValue CmpOp1) const { bool IsFP = CmpOp0.getValueType().isFloatingPoint(); bool Invert = false; SDValue Cmp; @@ -2960,6 +3032,12 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, // We define this so that it can be used for constant division. lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + else if (Subtarget.hasMiscellaneousExtensions2()) + // SystemZISD::SMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. 
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // @@ -4658,6 +4736,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); + OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); OPCODE(SDIVREM); OPCODE(UDIVREM); @@ -6118,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: + case SystemZ::SelectVR128: return emitSelect(MI, MBB, 0); case SystemZ::CondStore8Mux: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 6c9c404816f0..abe8b7233e60 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -88,6 +88,7 @@ enum NodeType : unsigned { // Wrappers around the ISD opcodes of the same name. The output is GR128. // Input operands may be GR64 or GR32, depending on the instruction. + SMUL_LOHI, UMUL_LOHI, SDIVREM, UDIVREM, @@ -479,6 +480,12 @@ private: const SystemZSubtarget &Subtarget; // Implement LowerOperation for individual opcodes. 
+ SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, SDValue CmpOp1) const; + SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, ISD::CondCode CC, + SDValue CmpOp0, SDValue CmpOp1) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 10172bd45203..02aeaadad0d9 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. -def SelectF32 : SelectWrapper; -def SelectF64 : SelectWrapper; -def SelectF128 : SelectWrapper; +def SelectF32 : SelectWrapper; +def SelectF64 : SelectWrapper; +let Predicates = [FeatureNoVectorEnhancements1] in + def SelectF128 : SelectWrapper; +let Predicates = [FeatureVectorEnhancements1] in + def SelectVR128 : SelectWrapper; defm CondStoreF32 : CondStores; @@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in { let Predicates = [FeatureVector] in { defm : CompareZeroFP; defm : CompareZeroFP; - defm : CompareZeroFP; } +let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in + defm : CompareZeroFP; // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; @@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in { } // The sign of an FP128 is in the high register. 
-def : Pat<(fcopysign FP32:$src1, FP128:$src2), - (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP64 result. let isCodeGenOnly = 1 in @@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP64:$src1, FP128:$src2), - (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP128 result. Use "upper" as the high half and leave // the low half as-is. 
@@ -101,12 +113,14 @@ class CopySign128 : Pat<(fcopysign FP128:$src1, cls:$src2), (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; -def : CopySign128; -def : CopySign128; -def : CopySign128; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128; + def : CopySign128; + def : CopySign128; +} defm LoadStoreF32 : MVCLoadStore; defm LoadStoreF64 : MVCLoadStore; @@ -166,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, Requires<[FeatureFPExtension]>; -def : Pat<(f32 (fpround FP128:$src)), - (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; -def : Pat<(f64 (fpround FP128:$src)), - (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; + def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +} // Extend register floating-point values to wider representations. -def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>; +def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; +def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; +def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; +} // Extend memory floating-point values to wider representations. 
def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>; +def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; +def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + (LXEB bdxaddr12only:$src)>; + def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + (LXDB bdxaddr12only:$src)>; +} // Convert a signed integer register value to a floating-point one. def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; @@ -426,16 +452,18 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)), // f128 multiplication of two FP64 registers. def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), - (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), - FP64:$src1, subreg_h64), FP64:$src2)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), + FP64:$src1, subreg_h64), FP64:$src2)>; // f128 multiplication of an FP64 register and an f64 memory. def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), - (f128 (extloadf64 bdxaddr12only:$addr))), - (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), - bdxaddr12only:$addr)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), + (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), + bdxaddr12only:$addr)>; // Fused multiply-add. 
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>; diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 7620e06ccbc9..033a0a879d37 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1091,6 +1091,94 @@ class InstVRIe op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRIf op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<5> V3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-28} = V3{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M5; + let Inst{19-12} = I4; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9} = V3{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIg op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<8> I3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = I4; + let Inst{23-20} = M5; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIh op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> I2; + bits<4> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = 0; + let Inst{31-16} = I2; + let Inst{15-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIi op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, 
asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<4> R2; + bits<8> I3; + bits<4> M4; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = R2; + let Inst{31-24} = 0; + let Inst{23-20} = M4; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + // Depending on the instruction mnemonic, certain bits may be or-ed into // the M4 value provided as explicit operand. These are passed as m4or. class InstVRRa op, dag outs, dag ins, string asmstr, list pattern, @@ -1259,6 +1347,67 @@ class InstVRRf op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRRg op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRh op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-28} = V2{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9} = V2{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRi op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<4> R1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = R1; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let 
Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstVRSa op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1321,6 +1470,25 @@ class InstVRSc op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRSd op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<4> R3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = R3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + class InstVRV op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1358,6 +1526,24 @@ class InstVRX op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVSI op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<8> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-32} = I3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + //===----------------------------------------------------------------------===// // Instruction classes for .insn directives //===----------------------------------------------------------------------===// @@ -1910,6 +2096,25 @@ class FixedCondBranchRX opcode> let M1 = V.ccmask; } +class CondBranchRXY opcode> + : InstRXYb { + let CCMaskFirst = 1; +} + +class AsmCondBranchRXY opcode> + : InstRXYb; + +class FixedCondBranchRXY opcode, + SDPatternOperator operator = null_frag> + : InstRXYb { + let isAsmParserOnly = V.alternate; + let M1 = V.ccmask; +} + class CmpBranchRIEa opcode, 
RegisterOperand cls, Immediate imm> : InstRIEa opcode, let AccessBytes = bytes; } +class StoreLengthVRSd opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVRSd { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreLengthVSI opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVSI { + let mayStore = 1; + let AccessBytes = bytes; +} + class StoreMultipleRS opcode, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa opcode, : InstRXa; +class SideEffectBinaryRXY opcode, + RegisterOperand cls> + : InstRXYa; + class SideEffectBinaryRILPC opcode, RegisterOperand cls> : InstRILb opcode> (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>; +class BinaryVRIh opcode> + : InstVRIh; + class BinaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0> : InstVRRa opcode, SDPatternOperator operator, mnemonic#"\t$V1, $R2, $R3", [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>; +class BinaryVRRi opcode, RegisterOperand cls> + : InstVRRi; + class BinaryVRSa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> : InstVRSa opcode> (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4), mnemonic#"\t$R1, $V3, $BD2, $M4", []>; +class BinaryVRSd opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVRSd { + let mayLoad = 1; + let AccessBytes = bytes; +} + class BinaryVRX opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes> : InstVRX opcode, RegisterOperand cls> let mayStore = 1; } +class BinaryVSI opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVSI { + let mayLoad = 1; + let AccessBytes = bytes; +} + class StoreBinaryVRV opcode, bits<5> bytes, Immediate index> : InstVRV opcode> let M5 = 0; } +class CompareVRRh opcode> + : InstVRRh { + let isCompare = 1; +} + class TestRXE opcode, SDPatternOperator operator, RegisterOperand cls> : InstRXE opcode> let mayLoad = 1; } +class TestVRRg opcode> + : 
InstVRRg; + class SideEffectTernarySSc opcode> : InstSSc opcode, SDPatternOperator operator, let M5 = type; } +class TernaryVRIi opcode, RegisterOperand cls> + : InstVRIi; + class TernaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcFloat opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0, bits<4> m5 = 0> + : InstVRRc { + let M4 = type; + let M5 = m5; +} + +class TernaryVRRcFloatGeneric opcode> + : InstVRRc; + class TernaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0> : InstVRRd opcode> let DisableEncoding = "$V1src"; } +class QuaternaryVRIf opcode> + : InstVRIf; + +class QuaternaryVRIg opcode> + : InstVRIg; + class QuaternaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, - bits<4> type, SDPatternOperator m6mask, bits<4> m6or> + TypedReg tr3, TypedReg tr4, bits<4> type, + SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> : InstVRRd { let M5 = type; } +class QuaternaryVRRdGeneric opcode> + : InstVRRd; + // Declare a pair of instructions, one which sets CC and one which doesn't. // The CC-setting form ends with "S" and sets the low bit of M6. // Also create aliases to make use of M6 operand optional in assembler. 
@@ -4041,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair opcode, SDPatternOperator operator_cc, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { - def "" : QuaternaryVRRd; def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in - def S : QuaternaryVRRd; def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, @@ -4055,10 +4364,7 @@ multiclass QuaternaryOptVRRdSPair opcode, } multiclass QuaternaryOptVRRdSPairGeneric opcode> { - def "" : InstVRRd; + def "" : QuaternaryVRRdGeneric; def : InstAlias(NAME) VR128:$V1, VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, 0)>; @@ -4366,10 +4672,10 @@ class RotateSelectRIEfPseudo // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is // the value of the PSW's 2-bit condition code field. -class SelectWrapper +class SelectWrapper : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), - [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, + [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 66a5ff12be46..4533f4fdf21a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -869,6 +869,37 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Move 128-bit floating-point values between VR128 and FP128. 
+ if (SystemZ::VR128BitRegClass.contains(DestReg) && + SystemZ::FP128BitRegClass.contains(SrcReg)) { + unsigned SrcRegHi = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned SrcRegLo = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg) + .addReg(SrcRegHi, getKillRegState(KillSrc)) + .addReg(SrcRegLo, getKillRegState(KillSrc)); + return; + } + if (SystemZ::FP128BitRegClass.contains(DestReg) && + SystemZ::VR128BitRegClass.contains(SrcReg)) { + unsigned DestRegHi = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned DestRegLo = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + if (DestRegHi != SrcReg) + copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false); + BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1); + return; + } + // Everything else needs only one instruction. 
unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) @@ -1434,6 +1465,7 @@ SystemZII::Branch SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { switch (MI.getOpcode()) { case SystemZ::BR: + case SystemZ::BI: case SystemZ::J: case SystemZ::JG: return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 4569be7602e4..f64c0d15ef83 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BC : CondBranchRX<"b#", 0x47>; def BCR : CondBranchRR<"b#r", 0x07>; + def BIC : CondBranchRXY<"bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BCAsm : AsmCondBranchRX<"bc", 0x47>; def BCRAsm : AsmCondBranchRR<"bcr", 0x07>; + def BICAsm : AsmCondBranchRXY<"bic", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } // Define AsmParser extended mnemonics for each general condition-code mask @@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BAsm#V : FixedCondBranchRX , "b#", 0x47>; def BRAsm#V : FixedCondBranchRR , "b#r", 0x07>; + def BIAsm#V : FixedCondBranchRXY, "bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } } @@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isIndirectBranch = 1 in { def B : FixedCondBranchRX; def BR : FixedCondBranchRR; + def BI : FixedCondBranchRXY, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -316,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in { // Select instructions //===----------------------------------------------------------------------===// -def Select32Mux : SelectWrapper, Requires<[FeatureHighWord]>; -def Select32 : 
SelectWrapper; -def Select64 : SelectWrapper; +def Select32Mux : SelectWrapper, Requires<[FeatureHighWord]>; +def Select32 : SelectWrapper; +def Select64 : SelectWrapper; // We don't define 32-bit Mux stores if we don't have STOCFH, because the // low-only STOC should then always be used if possible. @@ -921,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Addition of memory. defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>; defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>; + def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>; def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>; @@ -1006,6 +1016,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Subtraction of memory. defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; + def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>; def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>; } @@ -1207,6 +1219,15 @@ defm : RMWIByte; // Multiplication //===----------------------------------------------------------------------===// +// Multiplication of a register, setting the condition code. We prefer these +// over MS(G)R if available, even though we cannot use the condition code, +// since they are three-operand instructions. +let Predicates = [FeatureMiscellaneousExtensions2], + Defs = [CC], isCommutable = 1 in { + def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>; + def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>; +} + // Multiplication of a register. 
let isCommutable = 1 in { def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>; @@ -1226,21 +1247,37 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>; // Multiplication of memory. defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>; defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>; +def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; +// Multiplication of memory, setting the condition code. +let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in { + def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>; + def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>; +} + // Multiplication of a register, producing two results. -def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, + Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), + (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. 
def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, + Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; @@ -1765,8 +1802,29 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { GR128, GR128, GR128>; def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; } + let Predicates = [FeatureMessageSecurityAssist5] in - def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in + def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist8] in + def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, + GR128, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureGuardedStorage] in { + def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; + def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; + + let mayLoad = 1 in + def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>; + let mayStore = 1 in + def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index a9803c2d83e9..0112ebf1eb10 100644 --- 
a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -126,6 +126,10 @@ let hasSideEffects = 1, Defs = [CC] in let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; +// Insert reference bits multiple. +let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in + def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>; + // Perform frame management function. let hasSideEffects = 1 in def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 0158fe6aec08..c9a02d9c8082 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,7 +14,7 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; - def VLR32 : UnaryAliasVRR; + def VLR32 : UnaryAliasVRR; def VLR64 : UnaryAliasVRR; // Load GR from VR element. @@ -141,7 +141,7 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX; + def VL32 : UnaryAliasVRX; def VL64 : UnaryAliasVRX; // Load logical element and zero. @@ -154,6 +154,11 @@ let Predicates = [FeatureVector] in { (VLLEZF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)), (VLLEZG bdxaddr12only:$addr)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>; + def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)), + (VLLEZLF bdxaddr12only:$addr)>; + } // Load element. 
def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; @@ -170,6 +175,13 @@ let Predicates = [FeatureVector] in { def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Load rightmost with length. The number of loaded bytes is only known + // at run time. + def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>; + def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>; +} + // Use replicating loads if we're inserting a single element into an // undefined vector. This avoids a false dependency on the previous // register contents. @@ -219,7 +231,7 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VST32 : StoreAliasVRX; + def VST32 : StoreAliasVRX; def VST64 : StoreAliasVRX; // Scatter element. @@ -227,6 +239,13 @@ let Predicates = [FeatureVector] in { def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Store rightmost with length. The number of stored bytes is only known + // at run time. + def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>; + def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>; +} + //===----------------------------------------------------------------------===// // Selects and permutes //===----------------------------------------------------------------------===// @@ -256,6 +275,10 @@ let Predicates = [FeatureVector] in { // Permute doubleword immediate. def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; + // Bit Permute. + let Predicates = [FeatureVectorEnhancements1] in + def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>; + // Replicate. 
def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>; def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; @@ -424,6 +447,10 @@ let Predicates = [FeatureVector] in { def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; + // Not exclusive or. + let Predicates = [FeatureVectorEnhancements1] in + def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>; + // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -567,6 +594,17 @@ let Predicates = [FeatureVector] in { def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; + // Multiply sum logical. + let Predicates = [FeatureVectorEnhancements1] in { + def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>; + def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg, + v128q, v128g, v128g, v128q, 3>; + } + + // Nand. + let Predicates = [FeatureVectorEnhancements1] in + def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>; + // Nor. def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>; def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>; @@ -574,9 +612,19 @@ let Predicates = [FeatureVector] in { // Or. def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>; + // Or with complement. + let Predicates = [FeatureVectorEnhancements1] in + def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>; + // Population count. 
def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>; def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>; + def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>; + def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>; + def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>; + } // Element rotate left logical (with vector shift amount). def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; @@ -724,6 +772,14 @@ multiclass BitwiseVectorOps { (VNO VR128:$x, VR128:$y)>; def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; } + let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))), + (VNX VR128:$x, VR128:$y)>; + def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))), + (VNN VR128:$x, VR128:$y)>; + def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))), + (VOC VR128:$x, VR128:$y)>; + } } defm : BitwiseVectorOps; @@ -879,6 +935,11 @@ let Predicates = [FeatureVector] in { def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>; + def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>; + def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>; + } // Convert from fixed 64-bit. 
def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; @@ -910,6 +971,11 @@ let Predicates = [FeatureVector] in { def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>; + def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>; + def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>; + } // Load FP integer. def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; @@ -917,66 +983,213 @@ let Predicates = [FeatureVector] in { def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; defm : VectorRounding; defm : VectorRounding; + let Predicates = [FeatureVectorEnhancements1] in { + def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; + def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; + def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + defm : VectorRounding; + defm : VectorRounding; + defm : VectorRounding; + } // Load lengthened. 
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; - def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>; + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; + def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; + def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + } + def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>; + def : Pat<(f128 (fpextend (f32 VR32:$src))), + (WFLLD (WLDEB VR32:$src))>; + } - // Load rounded, + // Load rounded. def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; - def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; - def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; - def : FPConversion; + def : FPConversion; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; + def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } + def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; + def : FPConversion; + def : Pat<(f32 (fpround (f128 VR128:$src))), + (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>; + } + + // Maximum. 
+ multiclass VectorMax { + def : FPMinMax; + def : FPMinMax; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; + def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, + v128db, v128db, 3, 0>; + def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, + v64db, v64db, 3, 8>; + def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, + v128sb, v128sb, 2, 0>; + def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, + v32sb, v32sb, 2, 8>; + def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + } + + // Minimum. + multiclass VectorMin { + def : FPMinMax; + def : FPMinMax; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; + def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, + v128db, v128db, 3, 0>; + def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, + v64db, v64db, 3, 8>; + def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, + v128sb, v128sb, 2, 0>; + def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, + v32sb, v32sb, 2, 8>; + def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + } // Multiply. def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + } // Multiply and add. 
def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + } // Multiply and subtract. def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and add. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and subtract. 
+ let Predicates = [FeatureVectorEnhancements1] in { + def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + } // Perform sign operation. def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>; def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>; def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>; + def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>; + def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>; + } // Load complement. def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>; + def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>; + def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>; + } // Load negative. def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>; + def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>; + def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>; + } // Load positive. 
def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>; + def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>; + def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>; + } // Square root. def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + } // Subtract. def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + } // Test data class immediate. 
let Defs = [CC] in { def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>; def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>; + def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>; + def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>; + } } } @@ -989,12 +1202,20 @@ let Predicates = [FeatureVector] in { let Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>; + def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>; + } } // Compare and signal scalar. let Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>; + def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>; + } } // Compare equal. @@ -1003,6 +1224,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal equal. 
+ let Predicates = [FeatureVectorEnhancements1] in { + defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high. def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; @@ -1010,6 +1253,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high or equal. 
def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; @@ -1017,6 +1282,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high or equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } } //===----------------------------------------------------------------------===// @@ -1028,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; 
+def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), 
(f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; //===----------------------------------------------------------------------===// // Replicating scalars @@ -1133,6 +1433,20 @@ let AddedComplexity = 4 in { (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>; } +//===----------------------------------------------------------------------===// +// Support for 128-bit floating-point values in vector registers +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(f128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + + def : Pat<(f128 fpimm0), (VZERO)>; + def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>; +} + //===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// @@ -1202,3 +1516,37 @@ let Predicates = [FeatureVector] in { defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf, z_vstrcz_cc, v128f, v128f, 2, 2>; } + +//===----------------------------------------------------------------------===// +// Packed-decimal instructions +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorPackedDecimal] in { + def VLIP : BinaryVRIh<"vlip", 0xE649>; + + def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>; + def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; + + let Defs = [CC] in { + def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; + def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; + def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>; 
+ + def VAP : QuaternaryVRIf<"vap", 0xE671>; + def VSP : QuaternaryVRIf<"vsp", 0xE673>; + + def VMP : QuaternaryVRIf<"vmp", 0xE678>; + def VMSP : QuaternaryVRIf<"vmsp", 0xE679>; + + def VDP : QuaternaryVRIf<"vdp", 0xE67A>; + def VRP : QuaternaryVRIf<"vrp", 0xE67B>; + def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>; + + def VSRP : QuaternaryVRIg<"vsrp", 0xE659>; + def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>; + + def VTP : TestVRRg<"vtp", 0xE65F>; + def VCP : CompareVRRh<"vcp", 0xE677>; + } +} diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 9c6d5819f8a7..759a8bb0ce14 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -181,6 +181,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; +def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; @@ -549,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), (fma node:$src2, node:$src3, (fneg node:$src1))>; +// Negative fused multiply-add and multiply-subtract. +def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fma node:$src1, node:$src2, node:$src3))>; +def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fms node:$src1, node:$src2, node:$src3))>; + // Floating-point negative absolute. def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>; @@ -624,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr), (scalar_to_vector (f64 (load node:$addr))), (z_vzero))>; +// Similarly for the high element of a zeroed vector. 
+def z_vllezli32 : z_vllez; +def z_vllezlf32 : PatFrag<(ops node:$addr), + (bitconvert + (z_merge_high + (v2i64 + (bitconvert + (z_merge_high + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))), + (v4f32 (z_vzero))))), + (v2i64 (z_vzero))))>; + // Store one element of a vector. class z_vste : PatFrag<(ops node:$vec, node:$addr, node:$index), diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index 16a7ed784d70..152521fb66a8 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -167,3 +167,10 @@ class FPConversion suppress, bits<4> mode> : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))), (insn tr2.op:$vec, suppress, mode)>; + +// Use INSN to perform mininum/maximum operation OPERATOR on type TR. +// FUNCTION is the type of minimum/maximum function to perform. +class FPMinMax function> + : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))), + (insn tr.op:$vec1, tr.op:$vec2, function)>; diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index 1cdc0949ff4a..0dca4582dc0d 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>; def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; +def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; +def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 36809ea81dc1..52ba1a584017 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128", // All vector registers. 
defm VR128 : SystemZRegClass<"VR128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (add (sequence "V%u", 0, 7), - (sequence "V%u", 16, 31), - (sequence "V%u", 8, 15))>; + [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add (sequence "V%u", 0, 7), + (sequence "V%u", 16, 31), + (sequence "V%u", 8, 15))>; // Attaches a ValueType to a register operand, to make the instruction // definitions easier. @@ -272,7 +272,8 @@ class TypedReg { RegisterOperand op = opin; } -def v32eb : TypedReg; +def v32f : TypedReg; +def v32sb : TypedReg; def v64g : TypedReg; def v64db : TypedReg; def v128b : TypedReg; @@ -280,8 +281,9 @@ def v128h : TypedReg; def v128f : TypedReg; def v128g : TypedReg; def v128q : TypedReg; -def v128eb : TypedReg; +def v128sb : TypedReg; def v128db : TypedReg; +def v128xb : TypedReg; def v128any : TypedReg; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 1ce0168f95e9..8dba89f70a42 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def FPU2 : SchedWrite; def DFU : SchedWrite; def DFU2 : SchedWrite; -// Vector sub units (z13) +// Vector sub units (z13 and later) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; @@ -75,6 +75,7 @@ def VecXsPm : SchedWrite; def VBU : SchedWrite; +include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" include "SystemZScheduleZ196.td" diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td new file mode 100644 index 000000000000..f11177af91a5 --- /dev/null +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -0,0 +1,1611 @@ +//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z14 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Z14Model : SchedMachineModel { + + let UnsupportedFeatures = Arch12UnsupportedFeatures.List; + + let IssueWidth = 8; + let MicroOpBufferSize = 60; // Issue queues + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 20; +} + +let SchedModel = Z14Model in { + +// These definitions could be put in a subtarget common include file, +// but it seems the include system in Tablegen currently rejects +// multiple includes of same file. +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; +} +def : WriteRes { + let NumMicroOps = 0; + let EndGroup = 1; +} +def : WriteRes { let Latency = 2; let NumMicroOps = 0;} +def : WriteRes { let Latency = 3; let NumMicroOps = 0;} +def : WriteRes { let Latency = 4; let NumMicroOps = 0;} +def : WriteRes { let Latency = 5; let NumMicroOps = 0;} +def : WriteRes { let Latency = 6; let NumMicroOps = 0;} +def : WriteRes { let Latency = 7; let NumMicroOps = 0;} +def : WriteRes { let Latency = 8; let NumMicroOps = 0;} +def : WriteRes { let Latency = 9; let NumMicroOps = 0;} +def : WriteRes { let Latency = 10; let NumMicroOps = 0;} +def : WriteRes { let Latency = 11; let NumMicroOps = 0;} +def : WriteRes { let Latency = 12; let NumMicroOps = 0;} +def : WriteRes { let Latency = 15; let NumMicroOps = 0;} +def : WriteRes { let Latency = 20; let NumMicroOps = 0;} +def : WriteRes { let Latency = 30; let NumMicroOps = 0;} + +// Execution units. 
+def Z14_FXaUnit : ProcResource<2>; +def Z14_FXbUnit : ProcResource<2>; +def Z14_LSUnit : ProcResource<2>; +def Z14_VecUnit : ProcResource<2>; +def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z14_VBUnit : ProcResource<2>; + +// Subtarget specific definitions of scheduling resources. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 8; } +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 8; } +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes + { let Latency = 30; } +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 3; } +def : WriteRes; // Virtual Branching Unit + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
+ +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[VBU], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[VBU, FXa, FXa, Lat3, 
GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[FXb], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + +// Select pseudo +def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; + +// CondStore pseudos +def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; + +// Pseudo -> reg move +def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>; + +// Loads +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], 
(instregex "L128$")>; + +def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[FXa], (instregex "LG(F|H)I$")>; +def : InstRW<[FXa], (instregex "LHI(Mux)?$")>; +def : InstRW<[FXa], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>; +def : InstRW<[FXa], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "L(B|H|G)R$")>; +def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>; +def : InstRW<[FXa], (instregex "LTGFR$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>; +def : 
// NOTE(review): this chunk is a per-opcode scheduling table: each anonymous
// `def : InstRW<[resources...], (instregex "...")>` binds the instructions
// matched by the regex to a list of pipeline resources (FXa/FXb/LSU/Vec*),
// an optional latency override (LatN), and decoder-grouping tokens
// (GroupAlone/BeginGroup/EndGroup).  The mnemonics (RISBG, TBEGIN, CDSG, ...)
// look like a SystemZ model — confirm against the enclosing file's header,
// which lies outside this view.
// NOTE(review): the first line below is the tail of a def whose `def :`
// precedes this chunk; kept verbatim.
InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;

//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
def : InstRW<[LSU], (instregex "LLHRL$")>;
def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;

// Load and zero rightmost byte
def : InstRW<[LSU], (instregex "LLZRGF$")>;

// Load and trap
def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;

//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//

// Load multiple (estimated average of 5 ops)
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
             (instregex "LM(H|Y|G)?$")>;

// Load multiple disjoint
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;

// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
              GroupAlone], (instregex "STM(G|H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;

//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;

// Load the Global Offset Table address ( -> larl )
def : InstRW<[FXa], (instregex "GOT$")>;

//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LP(G)?R$")>;
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>;
def : InstRW<[FXa], (instregex "LN(R|GR)$")>;
def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;

//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
def : InstRW<[FXa], (instregex "IILF(64)?$")>;
def : InstRW<[FXa], (instregex "IILH(64)?$")>;
def : InstRW<[FXa], (instregex "IILL(64)?$")>;

//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXa], (instregex "AIH$")>;
def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
def : InstRW<[FXa], (instregex "AGFI$")>;
def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
def : InstRW<[FXa], (instregex "AGR(K)?$")>;
def : InstRW<[FXa], (instregex "AHI(K)?$")>;
def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
def : InstRW<[FXa], (instregex "ALGHSIK$")>;
def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;

// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;

// Add with sign extension (16/32 -> 64)
def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>;
def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;

//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
def : InstRW<[FXa], (instregex "SGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLFI$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLR(K)?$")>;
def : InstRW<[FXa], (instregex "SR(K)?$")>;
def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;

// Subtraction with borrow
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;

// Subtraction with sign extension (16/32 -> 64)
def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>;
def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;

//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
def : InstRW<[FXa], (instregex "NGR(K)?$")>;
def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
def : InstRW<[FXa], (instregex "NILF(64)?$")>;
def : InstRW<[FXa], (instregex "NILH(64)?$")>;
def : InstRW<[FXa], (instregex "NILL(64)?$")>;
def : InstRW<[FXa], (instregex "NR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;

//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
def : InstRW<[FXa], (instregex "OGR(K)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
def : InstRW<[FXa], (instregex "OILF(64)?$")>;
def : InstRW<[FXa], (instregex "OILH(64)?$")>;
def : InstRW<[FXa], (instregex "OILL(64)?$")>;
def : InstRW<[FXa], (instregex "OR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;

//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
def : InstRW<[FXa], (instregex "XIFMux$")>;
def : InstRW<[FXa], (instregex "XGR(K)?$")>;
def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
def : InstRW<[FXa], (instregex "XILF(64)?$")>;
def : InstRW<[FXa], (instregex "XR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;

//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>;
def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>;
def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>;
def : InstRW<[FXa, Lat7], (instregex "MSGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>;
def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>;
def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat4], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat4], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>;
def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>;
def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>;
def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>;
def : InstRW<[FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>;
def : InstRW<[FXa, Lat8, GroupAlone], (instregex "MGRK$")>;
def : InstRW<[FXa, LSU, Lat9, GroupAlone], (instregex "MSC$")>;
def : InstRW<[FXa, LSU, Lat11, GroupAlone], (instregex "MSGC$")>;
def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>;
def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>;

//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//

def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;

//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
             (instregex "S(L|R)D(A|L)$")>;

// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;

// Rotate and insert
def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
def : InstRW<[FXa], (instregex "RISBMux$")>;

// Rotate and Select
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>;

//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
def : InstRW<[FXb], (instregex "C(G)?R$")>;
def : InstRW<[FXb], (instregex "CIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
def : InstRW<[FXb], (instregex "CLGR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
def : InstRW<[FXb], (instregex "CLIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXb], (instregex "CLR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;

// Compare halfword
def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;

// Compare with sign extension (32 -> 64)
def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;

// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;

// Test under mask
def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;

// Compare logical characters under mask
def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//

def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
def : InstRW<[FXb], (instregex "NIAI$")>;

//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;

def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;

// Test and set
def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;

// Compare and swap
def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;

// Compare double and swap
def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
             (instregex "CDS(Y)?$")>;
def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
             (instregex "CDSG$")>;

// Compare and swap and store
def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;

// Perform locked operation
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;

// Load/store pair from/to quadword
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;

// Load pair disjoint
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;

//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//

def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;

//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>;
def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>;
def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>;

//===----------------------------------------------------------------------===//
// Guarded storage
//===----------------------------------------------------------------------===//

def : InstRW<[LSU], (instregex "LGG$")>;
def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>;

//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
             (instregex "CVBG$")>;
def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
             (instregex "CVDG$")>;
def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;

def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
             (instregex "(A|S|ZA)P$")>;
def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
             (instregex "(M|D)P$")>;
def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
             (instregex "SRP$")>;
def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;

//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//

// Extract/set/copy access register
def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;

// Load address extended
def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;

// Load/store access multiple (not modeled precisely)
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;

//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//

// Insert Program Mask
def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;

// Set Program Mask
def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;

// Branch and link
def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;

// Test addressing mode
def : InstRW<[FXb], (instregex "TAM$")>;

// Set addressing mode
def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;

// Branch (and save) and set mode.
def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;

//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//

// Transaction begin
def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
             (instregex "TBEGIN(C|_nofloat)?$")>;

// Transaction end
def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;

// Transaction abort
def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;

// Extract Transaction Nesting Depth
def : InstRW<[FXa], (instregex "ETND$")>;

// Nontransactional store
def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;

//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//

def : InstRW<[FXb], (instregex "PPA$")>;

//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//

// Find leftmost one
def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>;

// Population count
def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;

// Extend
def : InstRW<[FXa], (instregex "AEXT128$")>;
def : InstRW<[FXa], (instregex "ZEXT128$")>;

// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;

// Various complex instructions
def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;

// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;

//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//

// An "empty" sched-class will be assigned instead of the "invalid sched-class".
// getNumDecoderSlots() will then return 1 instead of 0.
def : InstRW<[], (instregex "Insn.*")>;


// ----------------------------- Floating point ----------------------------- //

//===----------------------------------------------------------------------===//
// FP: Select instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>;
def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;

//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//

// Load zero
def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;

// Load
def : InstRW<[VecXsPm], (instregex "LER$")>;
def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;

// Load and Test
def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
             (instregex "LTXBRCompare(_VecPseudo)?$")>;

// Copy sign
def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;

//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//

def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSU], (instregex "LX$")>;

//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;

//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;

// Load lengthened
def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
def : InstRW<[VecBF], (instregex "LDEBR$")>;
// NOTE(review): stray space before the comma after Lat12 is in the original;
// kept verbatim (whitespace-insignificant to TableGen).
def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;

// Convert from fixed / logical
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;

// Convert to fixed / logical
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;

//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load Complement / Negative / Positive
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;

// Square root
def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;

// Load FP integer
def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;

//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;

// Subtraction
def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;

// Multiply
def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;

// Multiply and add / subtract
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;

// Division
def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;

// Divide to integer
def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;

//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//

// Compare
def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;

// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;

//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;


// --------------------- Hexadecimal floating point ------------------------- //

//===----------------------------------------------------------------------===//
// HFP: Move instructions
//===----------------------------------------------------------------------===//

// Load and Test
def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
def : InstRW<[VecBF], (instregex "LEXR$")>;
def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;

// Load lengthened
def : InstRW<[LSU], (instregex "LDE$")>;
def : InstRW<[FXb], (instregex "LDER$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;

// Convert from fixed
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;

// Convert to fixed
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;

// Convert BFP to HFP / HFP to BFP.
def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;

//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load Complement / Negative / Positive
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;

// Halve
def : InstRW<[VecBF], (instregex "H(E|D)R$")>;

// Square root
def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;

// Load FP integer
def : InstRW<[VecBF], (instregex "FIER$")>;
def : InstRW<[VecBF], (instregex "FIDR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;

// Subtraction
def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;

// Multiply
def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;

// Multiply and add / subtract
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;

// Division
def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//

// Compare
def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;


// ------------------------ Decimal floating point -------------------------- //

//===----------------------------------------------------------------------===//
// DFP: Move instructions
//===----------------------------------------------------------------------===//

// Load and Test
def : InstRW<[VecDF], (instregex "LTDTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;

//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;

// Load lengthened
def : InstRW<[VecDF], (instregex "LDETR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;

// Convert from fixed / logical
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;

// Convert to fixed / logical
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;

// Convert from / to signed / unsigned packed
def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;

// Convert from / to zoned
def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;

// Convert from / to packed
def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;

// Perform floating-point operation
def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;

//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load FP integer
def : InstRW<[VecDF], (instregex "FIDTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;

// Extract biased exponent
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;

// Extract significance
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;

//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;

// Subtraction
def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;

// Multiply
def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;

// Division
def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;

// Quantize
def : InstRW<[VecDF], (instregex "QADTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;

// Reround
def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;

// Shift significand left/right
def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
// NOTE(review): the def below is truncated at the chunk boundary; its
// (instregex "...") operand and terminator lie outside this view.
def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex
"S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecDF], (instregex "(K|C)DTR$")>; +def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[VecDF], (instregex "CEDTR$")>; +def : InstRW<[VecDF], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "VLR(32|64)?$")>; +def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VZERO$")>; +def : InstRW<[VecXsPm], (instregex "VONE$")>; +def : InstRW<[VecXsPm], (instregex "VGBM$")>; +def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads 
+//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>; +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone], + (instregex "VSTM$")>; +def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPERM$")>; +def : InstRW<[VecXsPm], (instregex "VPDI$")>; +def : InstRW<[VecXsPm], (instregex "VBPERM$")>; +def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex 
"VPK(F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[VecXsPm], (instregex "VO(C)?$")>; +def : InstRW<[VecMul], (instregex "VCKSM$")>; +def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VX$")>; +def : InstRW<[VecMul], (instregex "VGFM?$")>; +def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>; +def 
: InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VML(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>; + +def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>; +def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>; + +def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, Lat4], (instregex 
"VEC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[VecBF], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GD$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "VFL(L|R)$")>; +def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>; +def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>; +def : InstRW<[VecBF2], (instregex "WFLLD$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; +def : InstRW<[VecBF2], (instregex "VFISB$")>; +def : InstRW<[VecBF], (instregex "WFISB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; 
+def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFMSB$")>; +def : InstRW<[VecBF], (instregex "WFMSB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[VecFPd], (instregex "VFD$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>; +def : InstRW<[VecFPd], (instregex "WFDXB$")>; +def : InstRW<[VecFPd], (instregex "VFSQ$")>; +def : 
InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>; +def : InstRW<[VecFPd], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LEFR$")>; +def : InstRW<[FXb, Lat4], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecStr], (instregex "VFAE(B)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>; +def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>; 
+def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecDF, VecDF, Lat10, GroupAlone], (instregex "VLIP$")>; +def : InstRW<[VecDFX, LSU, Lat12, GroupAlone], (instregex "VPKZ$")>; +def : InstRW<[VecDFX, FXb, LSU, Lat12, GroupAlone], (instregex "VUPKZ$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>; +def : InstRW<[VecDFX], (instregex "V(A|S)P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>; +def : InstRW<[VecDFX], (instregex "VPSOP$")>; +def : InstRW<[VecDFX], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "IRBM$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], 
(instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + 
+//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index e3e1999d8ad8..4d986e8391cf 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -311,7 +311,7 @@ def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : 
InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>; def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; @@ -337,7 +337,7 @@ def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -403,13 +403,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -436,7 +436,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU, 
Lat2], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -474,7 +475,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; def : InstRW<[FXU], (instregex "C(L)?HHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -499,7 +500,7 @@ def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch @@ -532,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -548,36 +549,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"TR$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : 
InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -621,7 +630,7 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -632,14 +641,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : 
InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -780,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -791,7 +800,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -813,9 +822,9 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -900,16 +909,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], 
(instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -949,16 +962,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, 
DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -967,7 +985,7 @@ def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>; def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -979,7 +997,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex 
"EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1010,15 +1028,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1027,15 +1045,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1046,19 +1064,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; 
def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1103,16 +1122,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], 
(instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1124,7 +1144,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1161,9 +1181,9 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; -def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; @@ -1176,7 +1196,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 59f37205f412..a0f2115eb9d7 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ 
b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -69,7 +69,7 @@ def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -251,7 +251,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], @@ -413,13 +413,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -446,7 +446,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; 
-def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -544,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -560,36 +561,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex 
"(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; 
//===----------------------------------------------------------------------===// @@ -659,7 +668,7 @@ def : InstRW<[FXU], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -670,14 +679,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -818,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -829,7 +838,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : 
InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -851,10 +860,10 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; -def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>; // --------------------- Hexadecimal floating point ------------------------- // @@ -938,16 +947,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, 
FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -987,16 +1000,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, 
DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -1007,11 +1025,11 @@ def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$") // Convert from / to zoned def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>; def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>; -def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; +def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1023,7 +1041,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1054,15 +1072,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; 
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1071,15 +1089,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1090,19 +1108,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, 
EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1148,16 +1167,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1169,7 +1189,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack 
Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1206,7 +1226,7 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; @@ -1221,7 +1241,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 7391df8342ef..13ceb371a425 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; + case SystemZ::WFASB: + Changed |= shortenOn001AddCC(MI, SystemZ::AEBR); + break; + case SystemZ::WFDDB: Changed |= shortenOn001(MI, SystemZ::DDBR); break; + case SystemZ::WFDSB: + Changed |= shortenOn001(MI, SystemZ::DEBR); + break; + case SystemZ::WFIDB: Changed |= shortenFPConv(MI, SystemZ::FIDBRA); break; + case SystemZ::WFISB: + Changed |= 
shortenFPConv(MI, SystemZ::FIEBRA); + break; + case SystemZ::WLDEB: Changed |= shortenOn01(MI, SystemZ::LDEBR); break; @@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MDBR); break; + case SystemZ::WFMSB: + Changed |= shortenOn001(MI, SystemZ::MEEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; + case SystemZ::WFLCSB: + Changed |= shortenOn01(MI, SystemZ::LCDFR_32); + break; + case SystemZ::WFLNDB: Changed |= shortenOn01(MI, SystemZ::LNDFR); break; + case SystemZ::WFLNSB: + Changed |= shortenOn01(MI, SystemZ::LNDFR_32); + break; + case SystemZ::WFLPDB: Changed |= shortenOn01(MI, SystemZ::LPDFR); break; + case SystemZ::WFLPSB: + Changed |= shortenOn01(MI, SystemZ::LPDFR_32); + break; + case SystemZ::WFSQDB: Changed |= shortenOn01(MI, SystemZ::SQDBR); break; + case SystemZ::WFSQSB: + Changed |= shortenOn01(MI, SystemZ::SQEBR); + break; + case SystemZ::WFSDB: Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; + case SystemZ::WFSSB: + Changed |= shortenOn001AddCC(MI, SystemZ::SEBR); + break; + case SystemZ::WFCDB: Changed |= shortenOn01(MI, SystemZ::CDBR); break; + case SystemZ::WFCSB: + Changed |= shortenOn01(MI, SystemZ::CEBR); + break; + case SystemZ::VL32: // For z13 we prefer LDE over LE to avoid partial register dependencies. 
Changed |= shortenOn0(MI, SystemZ::LDE32); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb4a0962f7eb..9cd09b0f911e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -47,6 +47,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), + HasMiscellaneousExtensions2(false), HasGuardedStorage(false), + HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), + HasVectorEnhancements1(false), HasVectorPackedDecimal(false), + HasInsertReferenceBitsMultiple(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index b05a1bb6cafd..4829f73e080e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,6 +56,13 @@ protected: bool HasLoadAndZeroRightmostByte; bool HasMessageSecurityAssist5; bool HasDFPPackedConversion; + bool HasMiscellaneousExtensions2; + bool HasGuardedStorage; + bool HasMessageSecurityAssist7; + bool HasMessageSecurityAssist8; + bool HasVectorEnhancements1; + bool HasVectorPackedDecimal; + bool HasInsertReferenceBitsMultiple; private: Triple TargetTriple; @@ -168,6 +175,33 @@ public: // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } + // Return true if the target has the miscellaneous-extensions facility 2. + bool hasMiscellaneousExtensions2() const { + return HasMiscellaneousExtensions2; + } + + // Return true if the target has the guarded-storage facility. + bool hasGuardedStorage() const { return HasGuardedStorage; } + + // Return true if the target has the message-security-assist + // extension facility 7. 
+ bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; } + + // Return true if the target has the message-security-assist + // extension facility 8. + bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; } + + // Return true if the target has the vector-enhancements facility 1. + bool hasVectorEnhancements1() const { return HasVectorEnhancements1; } + + // Return true if the target has the vector-packed-decimal facility. + bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; } + + // Return true if the target has the insert-reference-bits-multiple facility. + bool hasInsertReferenceBitsMultiple() const { + return HasInsertReferenceBitsMultiple; + } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index cb81c0e5276e..025bf73d2df0 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -143,8 +143,10 @@ public: } // end anonymous namespace void SystemZPassConfig::addIRPasses() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { addPass(createSystemZTDCPass()); + addPass(createLoopDataPrefetchPass()); + } TargetPassConfig::addIRPasses(); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 9ac768b2189d..506dc7427993 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -372,6 +372,9 @@ int SystemZTTIImpl::getArithmeticInstrCost( Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { switch (ScalarBits) { case 32: { + // The vector enhancements facility 1 provides v4f32 instructions. 
+ if (ST->hasVectorEnhancements1()) + return NumVectors; // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6923fc6fc910..a0c6fa94f8c1 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,6 +56,10 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; + unsigned getCacheLineSize() { return 256; } + unsigned getPrefetchDistance() { return 2000; } + unsigned getMinPrefetchStride() { return 2048; } + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149b..6e08d4cff6ea 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -37,6 +37,7 @@ endif() set(sources X86AsmPrinter.cpp X86CallFrameOptimization.cpp + X86CmovConversion.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 19c93cfff0fe..91201d1fec85 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -83,6 +83,9 @@ FunctionPass *createX86WinEHStatePass(); /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); +/// This pass converts X86 cmov instructions into branch when profitable. 
+FunctionPass *createX86CmovConverterPass(); + /// Return a Machine IR pass that selectively replaces /// certain byte and word instructions by equivalent 32 bit instructions, /// in order to eliminate partial register usage, false dependences on diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 4ca57fe9fb00..54eabeac5126 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -814,10 +814,8 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; -// TODO: The scheduler model falls to BTVER2 model. -// The znver1 model has to be put in place. -// Zen -def: ProcessorModel<"znver1", BtVer2Model, [ +// Znver1 +def: ProcessorModel<"znver1", Znver1Model, [ FeatureADX, FeatureAES, FeatureAVX2, diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6decb550ad5f..26461986427d 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -448,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::Swift", CCDelegateTo>, // Handle explicit CC selection - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, + CCIfCC<"CallingConv::Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, // Handle Vectorcall CC @@ -1004,7 +1004,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, + CCIfCC<"CallingConv::Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::HHVM", CCDelegateTo>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp new file mode 100644 index 000000000000..bfc834435de5 --- /dev/null +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -0,0 +1,611 @@ +//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This 
file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a pass that converts X86 cmov instructions into branch +/// when profitable. This pass is conservative, i.e., it applies transformation +/// if and only if it can gaurantee a gain with high confidence. +/// +/// Thus, the optimization applies under the following conditions: +/// 1. Consider as a candidate only CMOV in most inner loop, assuming that +/// most hotspots are represented by these loops. +/// 2. Given a group of CMOV instructions, that are using same EFLAGS def +/// instruction: +/// a. Consider them as candidates only if all have same code condition or +/// opposite one, to prevent generating more than one conditional jump +/// per EFLAGS def instruction. +/// b. Consider them as candidates only if all are profitable to be +/// converted, assuming that one bad conversion may casue a degradation. +/// 3. Apply conversion only for loop that are found profitable and only for +/// CMOV candidates that were found profitable. +/// a. Loop is considered profitable only if conversion will reduce its +/// depth cost by some thrishold. +/// b. CMOV is considered profitable if the cost of its condition is higher +/// than the average cost of its true-value and false-value by 25% of +/// branch-misprediction-penalty, this to assure no degredassion even +/// with 25% branch misprediction. +/// +/// Note: This pass is assumed to run on SSA machine code. 
+//===----------------------------------------------------------------------===// +// +// External interfaces: +// FunctionPass *llvm::createX86CmovConverterPass(); +// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF); +// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-cmov-converter" + +STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups"); +STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates"); +STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops"); +STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups"); + +namespace { +// This internal switch can be used to turn off the cmov/branch optimization. +static cl::opt + EnableCmovConverter("x86-cmov-converter", + cl::desc("Enable the X86 cmov-to-branch optimization."), + cl::init(true), cl::Hidden); + +/// Converts X86 cmov instructions into branches when profitable. +class X86CmovConverterPass : public MachineFunctionPass { +public: + X86CmovConverterPass() : MachineFunctionPass(ID) {} + ~X86CmovConverterPass() {} + + StringRef getPassName() const override { return "X86 cmov Conversion"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + /// Pass identification, replacement for typeid. + static char ID; + + const MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + TargetSchedModel TSchedModel; + + /// List of consecutive CMOV instructions. 
+ typedef SmallVector CmovGroup; + typedef SmallVector CmovGroups; + + /// Collect all CMOV-group-candidates in \p CurrLoop and update \p + /// CmovInstGroups accordingly. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff it found any CMOV-group-candidate. + bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups); + + /// Check if it is profitable to transform each CMOV-group-candidates into + /// branch. Remove all groups that are not profitable from \p CmovInstGroups. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff any CMOV-group-candidate remain. + bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups); + + /// Convert the given list of consecutive CMOV instructions into a branch. + /// + /// \param Group Consecutive CMOV instructions to be converted into branch. + void convertCmovInstsToBranches(SmallVectorImpl &Group) const; +}; + +char X86CmovConverterPass::ID = 0; + +void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); +} + +bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + if (!EnableCmovConverter) + return false; + + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << "**********\n"); + + bool Changed = false; + MachineLoopInfo &MLI = getAnalysis(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = STI.getInstrInfo(); + TSchedModel.init(STI.getSchedModel(), &STI, TII); + + //===--------------------------------------------------------------------===// + // Algorithm + // --------- + // For each inner most loop + // collectCmovCandidates() { + // Find all CMOV-group-candidates. 
+ // } + // + // checkForProfitableCmovCandidates() { + // * Calculate both loop-depth and optimized-loop-depth. + // * Use these depth to check for loop transformation profitability. + // * Check for CMOV-group-candidate transformation profitability. + // } + // + // For each profitable CMOV-group-candidate + // convertCmovInstsToBranches() { + // * Create FalseBB, SinkBB, Conditional branch to SinkBB. + // * Replace each CMOV instruction with a PHI instruction in SinkBB. + // } + // + // Note: For more details, see each function description. + //===--------------------------------------------------------------------===// + for (MachineBasicBlock &MBB : MF) { + MachineLoop *CurrLoop = MLI.getLoopFor(&MBB); + + // Optimize only inner most loops. + if (!CurrLoop || CurrLoop->getHeader() != &MBB || + !CurrLoop->getSubLoops().empty()) + continue; + + // List of consecutive CMOV instructions to be processed. + CmovGroups CmovInstGroups; + + if (!collectCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + Changed = true; + for (auto &Group : CmovInstGroups) + convertCmovInstsToBranches(Group); + } + return Changed; +} + +bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups) { + //===--------------------------------------------------------------------===// + // Collect all CMOV-group-candidates and add them into CmovInstGroups. + // + // CMOV-group: + // CMOV instructions, in same MBB, that uses same EFLAGS def instruction. + // + // CMOV-group-candidate: + // CMOV-group where all the CMOV instructions are + // 1. consecutive. + // 2. have same condition code or opposite one. + // 3. have only operand registers (X86::CMOVrr). 
+ //===--------------------------------------------------------------------===// + // List of possible improvement (TODO's): + // -------------------------------------- + // TODO: Add support for X86::CMOVrm instructions. + // TODO: Add support for X86::SETcc instructions. + // TODO: Add support for CMOV-groups with non consecutive CMOV instructions. + //===--------------------------------------------------------------------===// + + // Current processed CMOV-Group. + CmovGroup Group; + for (auto *MBB : CurrLoop->getBlocks()) { + Group.clear(); + // Condition code of first CMOV instruction current processed range and its + // opposite condition code. + X86::CondCode FirstCC, FirstOppCC; + // Indicator of a non CMOVrr instruction in the current processed range. + bool FoundNonCMOVInst = false; + // Indicator for current processed CMOV-group if it should be skipped. + bool SkipGroup = false; + + for (auto &I : *MBB) { + X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + // Check if we found a X86::CMOVrr instruction. + if (CC != X86::COND_INVALID && !I.mayLoad()) { + if (Group.empty()) { + // We found first CMOV in the range, reset flags. + FirstCC = CC; + FirstOppCC = X86::GetOppositeBranchCondition(CC); + FoundNonCMOVInst = false; + SkipGroup = false; + } + Group.push_back(&I); + // Check if it is a non-consecutive CMOV instruction or it has different + // condition code than FirstCC or FirstOppCC. + if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC)) + // Mark the SKipGroup indicator to skip current processed CMOV-Group. + SkipGroup = true; + continue; + } + // If Group is empty, keep looking for first CMOV in the range. + if (Group.empty()) + continue; + + // We found a non X86::CMOVrr instruction. + FoundNonCMOVInst = true; + // Check if this instruction define EFLAGS, to determine end of processed + // range, as there would be no more instructions using current EFLAGS def. 
+ if (I.definesRegister(X86::EFLAGS)) { + // Check if current processed CMOV-group should not be skipped and add + // it as a CMOV-group-candidate. + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + Group.clear(); + } + } + // End of basic block is considered end of range, check if current processed + // CMOV-group should not be skipped and add it as a CMOV-group-candidate. + if (Group.empty()) + continue; + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + } + + NumOfCmovGroupCandidate += CmovInstGroups.size(); + return !CmovInstGroups.empty(); +} + +/// \returns Depth of CMOV instruction as if it was converted into branch. +/// \param TrueOpDepth depth cost of CMOV true value operand. +/// \param FalseOpDepth depth cost of CMOV false value operand. +static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) { + //===--------------------------------------------------------------------===// + // With no info about branch weight, we assume 50% for each value operand. + // Thus, depth of optimized CMOV instruction is the rounded up average of + // its True-Operand-Value-Depth and False-Operand-Value-Depth. + //===--------------------------------------------------------------------===// + return (TrueOpDepth + FalseOpDepth + 1) / 2; +} + +bool X86CmovConverterPass::checkForProfitableCmovCandidates( + MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) { + struct DepthInfo { + /// Depth of original loop. + unsigned Depth; + /// Depth of optimized loop. + unsigned OptDepth; + }; + /// Number of loop iterations to calculate depth for ?! + static const unsigned LoopIterations = 2; + DenseMap DepthMap; + DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}}; + enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 }; + /// For each register type maps the register to its last def instruction. 
+ DenseMap RegDefMaps[RegTypeNum]; + /// Maps register operand to its def instruction, which can be nullptr if it + /// is unknown (e.g., operand is defined outside the loop). + DenseMap OperandToDefMap; + + // Set depth of unknown instruction (i.e., nullptr) to zero. + DepthMap[nullptr] = {0, 0}; + + SmallPtrSet CmovInstructions; + for (auto &Group : CmovInstGroups) + CmovInstructions.insert(Group.begin(), Group.end()); + + //===--------------------------------------------------------------------===// + // Step 1: Calculate instruction depth and loop depth. + // Optimized-Loop: + // loop with CMOV-group-candidates converted into branches. + // + // Instruction-Depth: + // instruction latency + max operand depth. + // * For CMOV instruction in optimized loop the depth is calculated as: + // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth) + // TODO: Find a better way to estimate the latency of the branch instruction + // rather than using the CMOV latency. + // + // Loop-Depth: + // max instruction depth of all instructions in the loop. + // Note: instruction with max depth represents the critical-path in the loop. + // + // Loop-Depth[i]: + // Loop-Depth calculated for first `i` iterations. + // Note: it is enough to calculate depth for up to two iterations. + // + // Depth-Diff[i]: + // Number of cycles saved in first 'i` iterations by optimizing the loop. + //===--------------------------------------------------------------------===// + for (unsigned I = 0; I < LoopIterations; ++I) { + DepthInfo &MaxDepth = LoopDepth[I]; + for (auto *MBB : CurrLoop->getBlocks()) { + // Clear physical registers Def map. + RegDefMaps[PhyRegType].clear(); + for (MachineInstr &MI : *MBB) { + unsigned MIDepth = 0; + unsigned MIDepthOpt = 0; + bool IsCMOV = CmovInstructions.count(&MI); + for (auto &MO : MI.uses()) { + // Checks for "isUse()" as "uses()" returns also implicit definitions. 
+ if (!MO.isReg() || !MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + if (MachineInstr *DefMI = RDM.lookup(Reg)) { + OperandToDefMap[&MO] = DefMI; + DepthInfo Info = DepthMap.lookup(DefMI); + MIDepth = std::max(MIDepth, Info.Depth); + if (!IsCMOV) + MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth); + } + } + + if (IsCMOV) + MIDepthOpt = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth, + DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth); + + // Iterates over all operands to handle implicit definitions as well. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + } + + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency}; + MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth); + MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt); + } + } + } + + unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth, + LoopDepth[1].Depth - LoopDepth[1].OptDepth}; + + //===--------------------------------------------------------------------===// + // Step 2: Check if Loop worth to be optimized. + // Worth-Optimize-Loop: + // case 1: Diff[1] == Diff[0] + // Critical-path is iteration independent - there is no dependency + // of critical-path instructions on critical-path instructions of + // previous iteration. + // Thus, it is enough to check gain percent of 1st iteration - + // To be conservative, the optimized loop need to have a depth of + // 12.5% cycles less than original loop, per iteration. + // + // case 2: Diff[1] > Diff[0] + // Critical-path is iteration dependent - there is dependency of + // critical-path instructions on critical-path instructions of + // previous iteration. 
+ // Thus, it is required to check the gradient of the gain - the + // change in Depth-Diff compared to the change in Loop-Depth between + // 1st and 2nd iterations. + // To be conservative, the gradient need to be at least 50%. + // + // If loop is not worth optimizing, remove all CMOV-group-candidates. + //===--------------------------------------------------------------------===// + bool WorthOptLoop = false; + if (Diff[1] == Diff[0]) + WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; + else if (Diff[1] > Diff[0]) + WorthOptLoop = + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + + if (!WorthOptLoop) + return false; + + ++NumOfLoopCandidate; + + //===--------------------------------------------------------------------===// + // Step 3: Check for each CMOV-group-candidate if it worth to be optimized. + // Worth-Optimize-Group: + // Iff it worths to optimize all CMOV instructions in the group. + // + // Worth-Optimize-CMOV: + // Predicted branch is faster than CMOV by the difference between depth of + // condition operand and depth of taken (predicted) value operand. + // To be conservative, the gain of such CMOV transformation should cover at + // at least 25% of branch-misprediction-penalty. + //===--------------------------------------------------------------------===// + unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; + CmovGroups TempGroups; + std::swap(TempGroups, CmovInstGroups); + for (auto &Group : TempGroups) { + bool WorthOpGroup = true; + for (auto *MI : Group) { + // Avoid CMOV instruction which value is used as a pointer to load from. + // This is another conservative check to avoid converting CMOV instruction + // used with tree-search like algorithm, where the branch is unpredicted. 
+ auto UIs = MRI->use_instructions(MI->defs().begin()->getReg()); + if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) { + unsigned Op = UIs.begin()->getOpcode(); + if (Op == X86::MOV64rm || Op == X86::MOV32rm) { + WorthOpGroup = false; + break; + } + } + + unsigned CondCost = + DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + unsigned ValCost = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, + DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); + if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) { + WorthOpGroup = false; + break; + } + } + + if (WorthOpGroup) + CmovInstGroups.push_back(Group); + } + + return !CmovInstGroups.empty(); +} + +static bool checkEFLAGSLive(MachineInstr *MI) { + if (MI->killsRegister(X86::EFLAGS)) + return false; + + // The EFLAGS operand of MI might be missing a kill marker. + // Figure out whether EFLAGS operand should LIVE after MI instruction. + MachineBasicBlock *BB = MI->getParent(); + MachineBasicBlock::iterator ItrMI = MI; + + // Scan forward through BB for a use/def of EFLAGS. + for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) { + if (I->readsRegister(X86::EFLAGS)) + return true; + if (I->definesRegister(X86::EFLAGS)) + return false; + } + + // We hit the end of the block, check whether EFLAGS is live into a successor. + for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { + if ((*I)->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + +void X86CmovConverterPass::convertCmovInstsToBranches( + SmallVectorImpl &Group) const { + assert(!Group.empty() && "No CMOV instructions to convert"); + ++NumOfOptimizedCmovGroups; + + // To convert a CMOVcc instruction, we actually have to insert the diamond + // control-flow pattern. The incoming instruction knows the destination vreg + // to set, the condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. 
+ + // Before + // ----- + // MBB: + // cond = cmp ... + // v1 = CMOVge t1, f1, cond + // v2 = CMOVlt t2, f2, cond + // v3 = CMOVge v1, f3, cond + // + // After + // ----- + // MBB: + // cond = cmp ... + // jge %SinkMBB + // + // FalseMBB: + // jmp %SinkMBB + // + // SinkMBB: + // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] + // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch + // ; true-value with false-value + // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use + // ; previous Phi instruction result + + MachineInstr &MI = *Group.front(); + MachineInstr *LastCMOV = Group.back(); + DebugLoc DL = MI.getDebugLoc(); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction::iterator It = ++MBB->getIterator(); + MachineFunction *F = MBB->getParent(); + const BasicBlock *BB = MBB->getBasicBlock(); + + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB); + F->insert(It, FalseMBB); + F->insert(It, SinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + if (checkEFLAGSLive(LastCMOV)) { + FalseMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to SinkMBB. + SinkMBB->splice(SinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Add the false and sink blocks as its successors. + MBB->addSuccessor(FalseMBB); + MBB->addSuccessor(SinkMBB); + + // Create the conditional branch instruction. + BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + + // Add the sink block to the false block successors. 
+ FalseMBB->addSuccessor(SinkMBB); + + MachineInstrBuilder MIB; + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + DenseMap> RegRewriteTable; + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are processing is the opposite condition from the jump we + // generated, then we have to swap the operands for the PHI that is going to + // be generated. + if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + std::swap(Op1Reg, Op2Reg); + + auto Op1Itr = RegRewriteTable.find(Op1Reg); + if (Op1Itr != RegRewriteTable.end()) + Op1Reg = Op1Itr->second.first; + + auto Op2Itr = RegRewriteTable.find(Op2Reg); + if (Op2Itr != RegRewriteTable.end()) + Op2Reg = Op2Itr->second.second; + + // SinkMBB: + // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ] + // ... + MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(MBB); + (void)MIB; + DEBUG(dbgs() << "\tFrom: "; MIIt->dump()); + DEBUG(dbgs() << "\tTo: "; MIB->dump()); + + // Add this PHI to the rewrite table. 
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // Now remove the CMOV(s). + MBB->erase(MIItBegin, MIItEnd); +} + +} // End anonymous namespace. + +FunctionPass *llvm::createX86CmovConverterPass() { + return new X86CmovConverterPass(); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ee9e78146305..527e5d568ac6 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1187,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && CC != CallingConv::X86_64_SysV && - CC != CallingConv::X86_64_Win64) + CC != CallingConv::Win64) return false; // Don't handle popping bytes if they don't fit the ret's immediate. @@ -3171,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: break; } diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index c28746f96439..95c6f2a3fa34 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -22,7 +22,7 @@ /// instructions and register-to-register moves. It would /// seem like cmov(s) would also be affected, but because of the way cmov is /// really implemented by most machines as reading both the destination and -/// and source regsters, and then "merging" the two based on a condition, +/// and source registers, and then "merging" the two based on a condition, /// it really already should be considered as having a true dependence on the /// destination register as well. 
/// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 65486cf7f529..44eecd664714 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Custom); } + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. @@ -1663,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). 
setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -2661,7 +2669,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -20188,7 +20196,10 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); @@ -20210,7 +20221,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -20235,7 +20249,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. 
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -20254,7 +20271,10 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -22665,10 +22685,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. + return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. 
@@ -22683,7 +22724,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -24030,7 +24071,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cc5c09cbf0e5..705d0f7a5cf7 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1759,29 +1759,29 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rr) _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rm) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrk) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT 
_.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, _.RC:$src1, addr:$src2), NewInf.KRC)>; } @@ -1798,7 +1798,7 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rmb) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -1879,7 +1879,7 @@ defm : avx512_icmp_packed_rmb_lowering; -defm : avx512_icmp_packed_rmb_lowering; defm : avx512_icmp_packed_rmb_lowering; @@ -2127,17 +2127,17 @@ multiclass avx512_icmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2145,37 +2145,37 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, + _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))), (i64 0)), - 
(COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc), NewInf.KRC)>; } } - + multiclass avx512_icmp_cc_packed_rmb_lowering Preds> + list Preds> : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), @@ -2187,7 +2187,7 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -2447,17 +2447,17 @@ multiclass avx512_fcmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.KVT (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2477,19 +2477,19 @@ let Predicates = Preds in { NewInf.KRC)>; } } - + multiclass avx512_fcmp_cc_packed_sae_lowering Preds> + string InstrStr, list Preds> : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { let Predicates = Preds in def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc, (i32 FROUND_NO_EXC))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; @@ -2817,16 +2817,16 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; 
- def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; } @@ -3036,7 +3036,7 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrr) @@ -3044,8 +3044,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrk) @@ -3063,7 +3063,7 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrri) @@ -3072,8 +3072,8 @@ def : Pat<(insert_subvector (v16i1 
immAllZerosV), imm:$cc), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrik) @@ -3379,35 +3379,35 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, "VMOVDQA32">, + HasAVX512, "VMOVDQA32">, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, "VMOVDQA64">, + HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16">, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512, "VMOVDQU32">, + HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512, "VMOVDQU64">, + HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. 
We need @@ -3964,49 +3964,49 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in { - def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrr">; let Constraints = "$src0 = $dst" in - def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrk">; - - def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrkz">; - def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrr">; let Constraints = "$src0 = $dst" in - def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, 
FoldGenData<"VMOVSDZrrk">; + VEX_W, FoldGenData<"VMOVSDZrrk">; - def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.KRCWM:$mask, VR128X:$src1, + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrrkz">; } @@ -5676,6 +5676,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG 
(v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), 
VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 7e4cba1c8345..343da2573b55 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -224,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; @@ -334,7 +334,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; @@ -450,7 +450,7 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return 
CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index a12fa68faf4f..d831a7974359 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -663,5 +663,6 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index ed53893b779c..9dcc968a1a7a 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -370,6 +370,22 @@ def : WriteRes { let Latency = 100; } def : WriteRes; def : WriteRes; +//////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteEXTRQ: SchedWriteRes<[JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>; + +def WriteINSERTQ: SchedWriteRes<[JFPU01]> { + let Latency = 2; + let ResourceCycles = [4]; +} +def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; + //////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 000000000000..d5b4cfe2ddee --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
+ +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair { + // Register variant takes 1-cycle on Execution Port. 
+ def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. + def : WriteRes { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 8; } + +def : WriteRes; +def : WriteRes; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; + +// IDIV +def : WriteRes { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes{ + let Latency = 4; +} +def : WriteRes { + let Latency = 4; +} + +def : WriteRes { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector Shift Operations +defm : ZnWriteResFpuPair; + +// AES Instructions. 
+defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +def : WriteRes; +def : WriteRes; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : ZnWriteResFpuPair; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fa0afe29586b..427a0001bef9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -597,7 +597,7 @@ public: case CallingConv::Intel_OCL_BI: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return true; // This convention allows using the SysV convention on Windows targets. case CallingConv::X86_64_SysV: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8d891c983fab..08c2cdaefe71 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -375,6 +375,7 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); + addPass(createX86CmovConverterPass()); return true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index aaa6d58bd134..c16207973b39 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,8 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. 
const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt index ad458450fda3..28da36bba209 100644 --- a/lib/ToolDrivers/CMakeLists.txt +++ b/lib/ToolDrivers/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(llvm-dlltool) add_subdirectory(llvm-lib) diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt index 7da9a5c01005..a49e04bdf3c1 100644 --- a/lib/ToolDrivers/LLVMBuild.txt +++ b/lib/ToolDrivers/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = llvm-lib +subdirectories = llvm-dlltool llvm-lib [component_0] type = Group diff --git a/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt new file mode 100644 index 000000000000..52bd5cba86f4 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DllOptionsTableGen) + +add_llvm_library(LLVMDlltoolDriver + DlltoolDriver.cpp + ) + +add_dependencies(LLVMDlltoolDriver DllOptionsTableGen) diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp new file mode 100644 index 000000000000..a7de79306074 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp @@ -0,0 +1,160 @@ +//===- DlltoolDriver.cpp - dlltool.exe-compatible driver ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a dlltool.exe-compatible driver. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/COFFImportFile.h" +#include "llvm/Object/COFFModuleDefinition.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Path.h" + +#include +#include + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::COFF; + +namespace { + +enum { + OPT_INVALID = 0, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +static const llvm::opt::OptTable::Info infoTable[] = { +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \ + {X1, X2, X10, X11, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12}, +#include "Options.inc" +#undef OPTION +}; + +class DllOptTable : public llvm::opt::OptTable { +public: + DllOptTable() : OptTable(infoTable, false) {} +}; + +} // namespace + +std::vector> OwningMBs; + +// Opens a file. Path has to be resolved already. +// Newly created memory buffers are owned by this driver. 
+MemoryBufferRef openFile(StringRef Path) { + ErrorOr> MB = MemoryBuffer::getFile(Path); + + if (std::error_code EC = MB.getError()) + llvm::errs() << "fail openFile: " << EC.message() << "\n"; + + MemoryBufferRef MBRef = MB.get()->getMemBufferRef(); + OwningMBs.push_back(std::move(MB.get())); // take ownership + return MBRef; +} + +static MachineTypes getEmulation(StringRef S) { + return StringSwitch(S) + .Case("i386", IMAGE_FILE_MACHINE_I386) + .Case("i386:x86-64", IMAGE_FILE_MACHINE_AMD64) + .Case("arm", IMAGE_FILE_MACHINE_ARMNT) + .Default(IMAGE_FILE_MACHINE_UNKNOWN); +} + +static std::string getImplibPath(std::string Path) { + SmallString<128> Out = StringRef("lib"); + Out.append(Path); + sys::path::replace_extension(Out, ".a"); + return Out.str(); +} + +int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { + DllOptTable Table; + unsigned MissingIndex; + unsigned MissingCount; + llvm::opt::InputArgList Args = + Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); + if (MissingCount) { + llvm::errs() << Args.getArgString(MissingIndex) << ": missing argument\n"; + return 1; + } + + // Handle when no input or output is specified + if (Args.hasArgNoClaim(OPT_INPUT) || + (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) { + Table.PrintHelp(outs(), ArgsArr[0], "dlltool", false); + llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm\n"; + return 1; + } + + if (!Args.hasArgNoClaim(OPT_m) && Args.hasArgNoClaim(OPT_d)) { + llvm::errs() << "error: no target machine specified\n" + << "supported targets: i386, i386:x86-64, arm\n"; + return 1; + } + + for (auto *Arg : Args.filtered(OPT_UNKNOWN)) + llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; + + MemoryBufferRef MB; + if (auto *Arg = Args.getLastArg(OPT_d)) + MB = openFile(Arg->getValue()); + + if (!MB.getBufferSize()) { + llvm::errs() << "definition file empty\n"; + return 1; + } + + COFF::MachineTypes Machine = IMAGE_FILE_MACHINE_UNKNOWN; + if (auto *Arg = 
Args.getLastArg(OPT_m)) + Machine = getEmulation(Arg->getValue()); + + if (Machine == IMAGE_FILE_MACHINE_UNKNOWN) { + llvm::errs() << "unknown target\n"; + return 1; + } + + Expected Def = + parseCOFFModuleDefinition(MB, Machine, true); + + if (!Def) { + llvm::errs() << "error parsing definition\n" + << errorToErrorCode(Def.takeError()).message(); + return 1; + } + + // Do this after the parser because parseCOFFModuleDefinition sets OutputFile. + if (auto *Arg = Args.getLastArg(OPT_D)) + Def->OutputFile = Arg->getValue(); + + if (Def->OutputFile.empty()) { + llvm::errs() << "no output file specified\n"; + return 1; + } + + std::string Path = Args.getLastArgValue(OPT_l); + if (Path.empty()) + Path = getImplibPath(Def->OutputFile); + + if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine)) + return 1; + return 0; +} diff --git a/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt new file mode 100644 index 000000000000..11736eb47bcb --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DlltoolDriver +parent = Libraries +required_libraries = Object Option Support diff --git a/lib/ToolDrivers/llvm-dlltool/Options.td b/lib/ToolDrivers/llvm-dlltool/Options.td new file mode 100644 index 000000000000..213c6a4d7674 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/Options.td @@ -0,0 +1,26 @@ +include "llvm/Option/OptParser.td" + +def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target machine">; +def m_long : JoinedOrSeparate<["--"], "machine">, Alias; + +def l: JoinedOrSeparate<["-"], "l">, HelpText<"Generate an import lib">; +def l_long : JoinedOrSeparate<["--"], "output-lib">, Alias; + +def D: JoinedOrSeparate<["-"], "D">, HelpText<"Specify the input DLL Name">; +def D_long : JoinedOrSeparate<["--"], "dllname">, Alias; + +def d: JoinedOrSeparate<["-"], "d">, HelpText<"Input .def File">; +def d_long : JoinedOrSeparate<["--"], "input-def">, Alias; + +//============================================================================== +// The flags below do nothing. They are defined only for dlltool compatibility. 
+//============================================================================== + +def k: Flag<["-"], "k">, HelpText<"Kill @n Symbol from export">; +def k_alias: Flag<["--"], "kill-at">, Alias; + +def S: JoinedOrSeparate<["-"], "S">, HelpText<"Assembler">; +def S_alias: JoinedOrSeparate<["--"], "as">, Alias; + +def f: JoinedOrSeparate<["-"], "f">, HelpText<"Assembler Flags">; +def f_alias: JoinedOrSeparate<["--"], "as-flags">, Alias; diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 3d57acf06e74..93eab680ca6b 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2026,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, continue; } + // LLVM's definition of dominance allows instructions that are cyclic + // in unreachable blocks, e.g.: + // %pat = select i1 %condition, @global, i16* %pat + // because any instruction dominates an instruction in a block that's + // not reachable from entry. + // So, remove unreachable blocks from the function, because a) there's + // no point in analyzing them and b) GlobalOpt should otherwise grow + // some more complicated logic to break these cycles. + // Removing unreachable blocks might invalidate the dominator so we + // recalculate it. + if (!F->isDeclaration()) { + if (removeUnreachableBlocks(*F)) { + auto &DT = LookupDomTree(*F); + DT.recalculate(*F); + Changed = true; + } + } + Changed |= processGlobal(*F, TLI, LookupDomTree); if (!F->hasLocalLinkage()) diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 00ddb93df830..317770d133b3 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -909,7 +909,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // To check this we also need to nuke any dead constant uses (perhaps // made dead by this operation on other functions). 
Callee.removeDeadConstantUsers(); - if (Callee.use_empty()) { + if (Callee.use_empty() && !CG.isLibFunction(Callee)) { Calls.erase( std::remove_if(Calls.begin() + i + 1, Calls.end(), [&Callee](const std::pair &Call) { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index ac4765f96075..6baada2c1ae1 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -173,8 +173,10 @@ protected: void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); + template void findEquivalencesFor(BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree); + DominatorTreeBase *DomTree); + void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); @@ -217,7 +219,7 @@ protected: /// \brief Dominance, post-dominance and loop information. std::unique_ptr DT; - std::unique_ptr> PDT; + std::unique_ptr> PDT; std::unique_ptr LI; AssumptionCacheTracker *ACT; @@ -773,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions( /// \param DomTree Opposite dominator tree. If \p Descendants is filled /// with blocks from \p BB1's dominator tree, then /// this is the post-dominator tree, and vice versa. 
+template void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree) { + DominatorTreeBase *DomTree) { const BasicBlock *EC = EquivalenceClass[BB1]; uint64_t Weight = BlockWeights[EC]; for (const auto *BB2 : Descendants) { @@ -1283,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { DT.reset(new DominatorTree); DT->recalculate(F); - PDT.reset(new DominatorTreeBase(true)); + PDT.reset(new PostDomTreeBase()); PDT->recalculate(F); LI.reset(new LoopInfo); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 773c86e23707..fdc9c373b95e 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1284,6 +1284,16 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I, Builder)) return replaceInstUsesWith(I, V); + if (match(Op1, m_One())) { + // (1 << x) & 1 --> zext(x == 0) + // (1 >> x) & 1 --> zext(x == 0) + Value *X; + if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X))))) { + Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0)); + return new ZExtInst(IsZero, I.getType()); + } + } + if (ConstantInt *AndRHS = dyn_cast(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); @@ -1315,23 +1325,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { break; } - case Instruction::Sub: - // -x & 1 -> x & 1 - if (AndRHSMask.isOneValue() && match(Op0LHS, m_Zero())) - return BinaryOperator::CreateAnd(Op0RHS, AndRHS); - - break; - - case Instruction::Shl: - case Instruction::LShr: - // (1 << x) & 1 --> zext(x == 0) - // (1 >> x) & 1 --> zext(x == 0) - if (AndRHSMask.isOneValue() && Op0LHS == AndRHS) { - Value *NewICmp = - Builder.CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType())); - return new ZExtInst(NewICmp, I.getType()); - } - break; } // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if 
C2 fits in the bitwidth @@ -1417,12 +1410,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } - // (A&((~A)|B)) -> A&B - if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op1); - if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op0); - // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) @@ -2020,18 +2007,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *A, *B; - // ((~A & B) | A) -> (A | B) - if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A)))) - return BinaryOperator::CreateOr(A, Op1); - if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateOr(Op0, A); - - // ((A & B) | ~A) -> (~A | B) - // The NOT is guaranteed to be in the RHS by complexity ordering. - if (match(Op1, m_Not(m_Value(A))) && - match(Op0, m_c_And(m_Specific(A), m_Value(B)))) - return BinaryOperator::CreateOr(Op1, B); - // (A & C)|(B & D) Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && @@ -2176,17 +2151,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Not, Op0); } - // (A & B) | (~A ^ B) -> (~A ^ B) - // (A & B) | (B ^ ~A) -> (~A ^ B) - // (B & A) | (~A ^ B) -> (~A ^ B) - // (B & A) | (B ^ ~A) -> (~A ^ B) - // The match order is important: match the xor first because the 'not' - // operation defines 'A'. We do not need to match the xor as Op0 because the - // xor was canonicalized to Op1 above. 
- if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && - match(Op0, m_c_And(m_Specific(A), m_Specific(B)))) - return BinaryOperator::CreateXor(Builder.CreateNot(A), B); - if (SwappedForXor) std::swap(Op0, Op1); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 60d1cde971dd..a8faaecb5c34 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1814,9 +1814,21 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType())); Value *CmpQ = Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType())); - auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And - : Instruction::Or; - return BinaryOperator::Create(LogicOpc, CmpP, CmpQ); + auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, CmpP, CmpQ); + } + + // Are we using xors to bitwise check for a pair of (in)equalities? Convert to + // a shorter form that has more potential to be folded even further. + Value *X1, *X2, *X3, *X4; + if (match(Or->getOperand(0), m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) && + match(Or->getOperand(1), m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) { + // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4) + // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4) + Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2); + Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4); + auto BOpc = Pred == CmpInst::ICMP_EQ ? 
Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, Cmp12, Cmp34); } return nullptr; @@ -3737,6 +3749,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, const APInt &CVal = CI->getValue(); if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) return nullptr; + } else { + // In this case we could have the operand of the binary operation + // being defined in another block, and performing the replacement + // could break the dominance relation. + return nullptr; } } else { // Other uses prohibit this transformation. @@ -3856,18 +3873,17 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, } else if (BinaryOperator *BO = dyn_cast(U)) { assert(BO->getOpcode() == Instruction::And); // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) - Value *ShortMask = - Builder.CreateTrunc(BO->getOperand(1), Builder.getIntNTy(MulWidth)); + ConstantInt *CI = cast(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask); - Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType()); - if (auto *ZextI = dyn_cast(Zext)) - IC.Worklist.Add(ZextI); + Instruction *Zext = + cast(Builder.CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); IC.replaceInstUsesWith(*BO, Zext); } else { llvm_unreachable("Unexpected Binary operation"); } - if (auto *UI = dyn_cast(U)) - IC.Worklist.Add(UI); + IC.Worklist.Add(cast(U)); } } if (isa(OtherVal)) diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index c59e1ce69ac2..451036545741 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -998,8 +998,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // that this code is not reachable. 
We do this instead of inserting // an unreachable instruction directly because we cannot modify the // CFG. - new StoreInst(UndefValue::get(LI.getType()), - Constant::getNullValue(Op->getType()), &LI); + StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()), + Constant::getNullValue(Op->getType()), &LI); + SI->setDebugLoc(LI.getDebugLoc()); return replaceInstUsesWith(LI, UndefValue::get(LI.getType())); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 5689c0604239..a20f474cbf40 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -417,8 +417,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // the highest demanded bit, we just return the other side. if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); - // We can't do this with the LHS for subtraction. - if (I->getOpcode() == Instruction::Add && + // We can't do this with the LHS for subtraction, unless we are only + // demanding the LSB. + if ((I->getOpcode() == Instruction::Add || + DemandedFromOps.isOneValue()) && DemandedFromOps.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 90e232399155..c7766568fd9d 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -636,17 +636,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I)); + // Do "A op C" and "B op C" both simplify? 
- if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - C = Builder.CreateBinOp(InnerOpcode, L, R); - C->takeName(&I); - return C; - } + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + C = Builder.CreateBinOp(InnerOpcode, L, R); + C->takeName(&I); + return C; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "B op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, B, C); + C->takeName(&I); + return C; + } + + // Does "B op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, A, C); + C->takeName(&I); + return C; + } } if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) { @@ -655,17 +673,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + // Do "A op B" and "A op C" both simplify? - if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - A = Builder.CreateBinOp(InnerOpcode, L, R); - A->takeName(&I); - return A; - } + if (L && R) { + // They do! Return "L op' R". 
+ ++NumExpand; + A = Builder.CreateBinOp(InnerOpcode, L, R); + A->takeName(&I); + return A; + } + + // Does "A op B" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "A op C". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, C); + A->takeName(&I); + return A; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op B". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, B); + A->takeName(&I); + return A; + } } // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 184940b7ea58..057f746e052d 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -22,9 +22,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -43,6 +45,7 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" @@ -192,6 +195,11 @@ static cl::opt ClMaxInlinePoisoningSize( static cl::opt ClUseAfterReturn("asan-use-after-return", cl::desc("Check stack-use-after-return"), cl::Hidden, cl::init(true)); +static cl::opt ClRedzoneByvalArgs("asan-redzone-byval-args", + cl::desc("Create redzones for byval " + "arguments 
(extra copy " + "required)"), cl::Hidden, + cl::init(true)); static cl::opt ClUseAfterScope("asan-use-after-scope", cl::desc("Check stack-use-after-scope"), cl::Hidden, cl::init(false)); @@ -747,6 +755,9 @@ struct FunctionStackPoisoner : public InstVisitor { bool runOnFunction() { if (!ClStack) return false; + + if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas(); + // Collect alloca, ret, lifetime instructions etc. for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); @@ -763,6 +774,11 @@ struct FunctionStackPoisoner : public InstVisitor { return true; } + // Arguments marked with the "byval" attribute are implicitly copied without + // using an alloca instruction. To produce redzones for those arguments, we + // copy them a second time into memory allocated with an alloca instruction. + void copyArgsPassedByValToAllocas(); + // Finds all Alloca instructions and puts // poisoned red zones around all of them. // Then unpoison everything back before the function returns. @@ -2528,6 +2544,28 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) { llvm_unreachable("impossible LocalStackSize"); } +void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { + BasicBlock &FirstBB = *F.begin(); + IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt()); + const DataLayout &DL = F.getParent()->getDataLayout(); + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) { + Type *Ty = Arg.getType()->getPointerElementType(); + unsigned Align = Arg.getParamAlignment(); + if (Align == 0) Align = DL.getABITypeAlignment(Ty); + + const std::string &Name = Arg.hasName() ? 
Arg.getName().str() : + "Arg" + llvm::to_string(Arg.getArgNo()); + AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval"); + AI->setAlignment(Align); + Arg.replaceAllUsesWith(AI); + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + IRB.CreateMemCpy(AI, &Arg, AllocSize, Align); + } + } +} + PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, Instruction *ThenTerm, diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 1348e0ed0ed0..b7c6271869cd 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3039,7 +3039,7 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVAStartInst(VAStartInst &I) override { - if (F.getCallingConv() == CallingConv::X86_64_Win64) + if (F.getCallingConv() == CallingConv::Win64) return; IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); @@ -3053,7 +3053,7 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVACopyInst(VACopyInst &I) override { - if (F.getCallingConv() == CallingConv::X86_64_Win64) + if (F.getCallingConv() == CallingConv::Win64) return; IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index e3c36c98ab0d..06fe07598374 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -281,6 +281,16 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTraceSwitchFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy)); + // Make sure smaller parameters are zero-extended to i64 as required by the + // x86_64 ABI. 
+ if (TargetTriple.getArch() == Triple::x86_64) { + for (int i = 0; i < 3; i++) { + SanCovTraceCmpFunction[i]->addParamAttr(0, Attribute::ZExt); + SanCovTraceCmpFunction[i]->addParamAttr(1, Attribute::ZExt); + } + SanCovTraceDivFunction[0]->addParamAttr(0, Attribute::ZExt); + } + // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 7fd77a082b82..c5c9b2c185d6 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -562,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration, if (!MSSA) return false; + // If MemorySSA has determined that one of EarlierInst or LaterInst does not + // read/write memory, then we can safely return true here. + // FIXME: We could be more aggressive when checking doesNotAccessMemory(), + // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass + // by also checking the MemorySSA MemoryAccess on the instruction. Initial + // experiments suggest this isn't worthwhile, at least for C/C++ code compiled + // with the default optimization pipeline. + auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst); + if (!EarlierMA) + return true; + auto *LaterMA = MSSA->getMemoryAccess(LaterInst); + if (!LaterMA) + return true; + // Since we know LaterDef dominates LaterInst and EarlierInst dominates // LaterInst, if LaterDef dominates EarlierInst then it can't occur between // EarlierInst and LaterInst and neither can any other write that potentially // clobbers LaterInst. 
MemoryAccess *LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst); - return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst)); + return MSSA->dominates(LaterDef, EarlierMA); } bool EarlyCSE::processNode(DomTreeNode *Node) { diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0fe72f3f7331..ea28705e684d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1168,6 +1168,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), LI->getSyncScopeID(), UnavailablePred->getTerminator()); + NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. AAMDNodes Tags; diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index a40c22c3fce9..99b4458ea0fa 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -805,6 +805,25 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP ConstantInt *One = ConstantInt::get(IndVarTy, 1); // TODO: generalize the predicates here to also match their unsigned variants. if (IsIncreasing) { + bool DecreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (++i != len) { while (++i < len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SLT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMin(SE, RightSCEV)) { + // while (true) { while (true) { + // if (++i == len) ---> if (++i > len - 1) + // break; break; + // ... ... 
+ // } } + Pred = ICmpInst::ICMP_SGT; + RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); + DecreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); @@ -829,16 +848,41 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateAdd(RightValue, One); + // We need to increase the right value unless we have already decreased + // it virtually when we replaced EQ with SGT. + if (!DecreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateAdd(RightValue, One); + } } else { if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } + assert(!DecreasedRightValueByOne && + "Right value can be decreased only for LatchBrExitIdx == 0!"); } } else { + bool IncreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (--i != len) { while (--i > len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SGT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMax(SE, RightSCEV)) { + // while (true) { while (true) { + // if (--i == len) ---> if (--i < len + 1) + // break; break; + // ... ... 
+ // } } + Pred = ICmpInst::ICMP_SLT; + RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); + IncreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); @@ -863,14 +907,20 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateSub(RightValue, One); + // We need to decrease the right value unless we have already increased + // it virtually when we replaced EQ with SLT. + if (!IncreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateSub(RightValue, One); + } } else { if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by lower limit"; return None; } + assert(!IncreasedRightValueByOne && + "Right value can be increased only for LatchBrExitIdx == 0!"); } } @@ -922,14 +972,18 @@ LoopConstrainer::calculateSubRanges() const { bool Increasing = MainLoopStructure.IndVarIncreasing; - // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the - // range of values the induction variable takes. + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or + // [Smallest, GreatestSeen] is the range of values the induction variable + // takes. - const SCEV *Smallest = nullptr, *Greatest = nullptr; + const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr; + const SCEV *One = SE.getOne(Ty); if (Increasing) { Smallest = Start; Greatest = End; + // No overflow, because the range [Smallest, GreatestSeen] is not empty. + GreatestSeen = SE.getMinusSCEV(End, One); } else { // These two computations may sign-overflow. Here is why that is okay: // @@ -947,9 +1001,9 @@ LoopConstrainer::calculateSubRanges() const { // will be an empty range. 
Returning an empty range is always safe. // - const SCEV *One = SE.getOne(Ty); Smallest = SE.getAddExpr(End, One); Greatest = SE.getAddExpr(Start, One); + GreatestSeen = Start; } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { @@ -964,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const { Result.LowLimit = Clamp(Range.getBegin()); bool ProvablyNoPostLoop = - SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd()); if (!ProvablyNoPostLoop) Result.HighLimit = Clamp(Range.getEnd()); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index ee3de51b1360..4056cc5cb346 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -2168,11 +2168,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { return false; } -/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form +/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the +/// same BB in the form /// bb: /// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... -/// %s = select p, trueval, falseval +/// %s = select %p, trueval, falseval /// +/// or +/// +/// bb: +/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... +/// %c = cmp %p, 0 +/// %s = select %c, trueval, falseval +// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// @@ -2186,44 +2194,54 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { if (LoopHeaders.count(BB)) return false; - // Look for a Phi/Select pair in the same basic block. The Phi feeds the - // condition of the Select and at least one of the incoming values is a - // constant. 
for (BasicBlock::iterator BI = BB->begin(); PHINode *PN = dyn_cast(BI); ++BI) { - unsigned NumPHIValues = PN->getNumIncomingValues(); - if (NumPHIValues == 0 || !PN->hasOneUse()) - continue; - - SelectInst *SI = dyn_cast(PN->user_back()); - if (!SI || SI->getParent() != BB) - continue; - - Value *Cond = SI->getCondition(); - if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1)) + // Look for a Phi having at least one constant incoming value. + if (llvm::all_of(PN->incoming_values(), + [](Value *V) { return !isa(V); })) continue; - bool HasConst = false; - for (unsigned i = 0; i != NumPHIValues; ++i) { - if (PN->getIncomingBlock(i) == BB) + auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) { + // Check if SI is in BB and use V as condition. + if (SI->getParent() != BB) return false; - if (isa(PN->getIncomingValue(i))) - HasConst = true; - } + Value *Cond = SI->getCondition(); + return (Cond && Cond == V && Cond->getType()->isIntegerTy(1)); + }; - if (HasConst) { - // Expand the select. - TerminatorInst *Term = - SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); - PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); - NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); - NewPN->addIncoming(SI->getFalseValue(), BB); - SI->replaceAllUsesWith(NewPN); - SI->eraseFromParent(); - return true; + SelectInst *SI = nullptr; + for (Use &U : PN->uses()) { + if (ICmpInst *Cmp = dyn_cast(U.getUser())) { + // Look for a ICmp in BB that compares PN with a constant and is the + // condition of a Select. + if (Cmp->getParent() == BB && Cmp->hasOneUse() && + isa(Cmp->getOperand(1 - U.getOperandNo()))) + if (SelectInst *SelectI = dyn_cast(Cmp->user_back())) + if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) { + SI = SelectI; + break; + } + } else if (SelectInst *SelectI = dyn_cast(U.getUser())) { + // Look for a Select in BB that uses PN as condtion. 
+ if (isUnfoldCandidate(SelectI, U.get())) { + SI = SelectI; + break; + } + } } + + if (!SI) + continue; + // Expand the select. + TerminatorInst *Term = + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); + NewPN->addIncoming(SI->getFalseValue(), BB); + SI->replaceAllUsesWith(NewPN); + SI->eraseFromParent(); + return true; } - return false; } diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 606136dc31a4..2e0d8e0374c0 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, + OptimizationRemarkEmitter *ORE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} + PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -353,6 +355,8 @@ private: LoopInfo *LI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; bool InnerLoopHasReduction; }; @@ -361,8 +365,9 @@ private: /// loop. 
class LoopInterchangeProfitability { public: - LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, @@ -376,6 +381,8 @@ private: /// Scev analysis. ScalarEvolution *SE; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; }; /// LoopInterchangeTransform interchanges the loop. @@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass { DependenceInfo *DI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; + LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); @@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass { AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); + AU.addRequired(); } bool runOnFunction(Function &F) override { @@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass { DI = &getAnalysis().getDI(); auto *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + ORE = &getAnalysis().getORE(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Build up a worklist of loop pairs to analyze. @@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass { Loop *OuterLoop = LoopList[OuterLoopId]; LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, - PreserveLCSSA); + PreserveLCSSA, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. 
Cannot prove legality\n"); return false; } DEBUG(dbgs() << "Loops are legal to interchange\n"); - LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE); + LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Interchanging loops not profitable\n"); return false; } + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Loop interchanged with enclosing loop."); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); @@ -760,6 +777,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."); return true; } @@ -767,6 +790,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (Inductions.size() != 1) { DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 
<< "Failed to interchange due to current limitation\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiInductionInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with 1 induction variable can be " + "interchanged currently."); return true; } if (Reductions.size() > 0) @@ -777,6 +806,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."); return true; } @@ -785,18 +820,35 @@ bool LoopInterchangeLegality::currentLimitations() { if (!Reductions.empty()) { DEBUG(dbgs() << "Outer loops with reductions are not supported " << "currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "ReductionsOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Outer loops with reductions cannot be interchangeed " + "currently."); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { DEBUG(dbgs() << "Loops with more than 1 induction variables are not " << "supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " + "interchanged currently."); return true; } // TODO: Triangular loops are not handled for now. 
if (!isLoopStructureUnderstood(InnerInductionVar)) { DEBUG(dbgs() << "Loop structure not understood by pass\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedStructureInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Inner loop structure not understood currently."); return true; } @@ -805,12 +857,24 @@ bool LoopInterchangeLegality::currentLimitations() { getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with LCSSA PHIs can be interchange " + "currently."); return true; } LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuterInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with LCSSA PHIs can be interchange " + "currently."); return true; } @@ -835,6 +899,11 @@ bool LoopInterchangeLegality::currentLimitations() { if (!InnerIndexVarInc) { DEBUG(dbgs() << "Did not find an instruction to increment the induction " << "variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIncrementInInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "The inner loop does not increment the induction variable."); return true; } @@ -852,6 +921,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!I.isIdenticalTo(InnerIndexVarInc)) { DEBUG(dbgs() << "Found unsupported instructions between induction " << "variable increment and branch.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedInsBetweenInduction", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) 
+ << "Found unsupported instruction between induction variable " + "increment and branch."); return true; } @@ -862,6 +937,11 @@ bool LoopInterchangeLegality::currentLimitations() { // current limitation. if (!FoundInduction) { DEBUG(dbgs() << "Did not find the induction variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIndutionVariable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Did not find the induction variable."); return true; } return false; @@ -875,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << " due to dependence\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "Dependence", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops due to dependences."); return false; } @@ -910,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // Check if the loops are tightly nested. if (!tightlyNested(OuterLoop, InnerLoop)) { DEBUG(dbgs() << "Loops not tightly nested\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NotTightlyNested", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops because they are not tightly " + "nested."); return false; } @@ -1005,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. 
- bool ImprovesPar = - isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); - return ImprovesPar; + if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) + return true; + + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops is too costly (cost=" + << ore::NV("Cost", Cost) << ", threshold=" + << ore::NV("Threshold", LoopInterchangeCostThreshold) << + ") and it does not improve parallelism."); + return false; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, @@ -1291,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 9397b87cdf56..90c5c243f464 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -68,6 +68,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); /// If it contains any dynamic allocas, returns false. static bool canTRE(Function &F) { // Because of PR962, we don't TRE dynamic allocas. 
- for (auto &BB : F) { - for (auto &I : BB) { - if (AllocaInst *AI = dyn_cast(&I)) { - if (!AI->isStaticAlloca()) - return false; - } - } - } - - return true; + return llvm::all_of(instructions(F), [](Instruction &I) { + auto *AI = dyn_cast(&I); + return !AI || AI->isStaticAlloca(); + }); } namespace { diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 5170c68e2915..d43ce7abb7cd 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -22,6 +22,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -736,7 +737,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // remainder are connected to the original Loop's exit blocks. The remaining // work is to update the phi nodes in the original loop, and take in the // values from the cloned region. Also update the dominator info for - // OtherExits, since we have new edges into OtherExits. + // OtherExits and their immediate successors, since we have new edges into + // OtherExits. + SmallSet ImmediateSuccessorsOfExitBlocks; for (auto *BB : OtherExits) { for (auto &II : *BB) { @@ -759,12 +762,35 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, cast(VMap[Phi->getIncomingBlock(i)])); } } +#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG) + for (BasicBlock *SuccBB : successors(BB)) { + assert(!(any_of(OtherExits, + [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) || + SuccBB == LatchExit) && + "Breaks the definition of dedicated exits!"); + } +#endif // Update the dominator info because the immediate dominator is no longer the // header of the original Loop. BB has edges both from L and remainder code. 
// Since the preheader determines which loop is run (L or directly jump to // the remainder code), we set the immediate dominator as the preheader. - if (DT) + if (DT) { DT->changeImmediateDominator(BB, PreHeader); + // Also update the IDom for immediate successors of BB. If the current + // IDom is the header, update the IDom to be the preheader because that is + // the nearest common dominator of all predecessors of SuccBB. We need to + // check for IDom being the header because successors of exit blocks can + // have edges from outside the loop, and we should not incorrectly update + // the IDom in that case. + for (BasicBlock *SuccBB: successors(BB)) + if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) { + if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) { + assert(!SuccBB->getSinglePredecessor() && + "BB should be the IDom then!"); + DT->changeImmediateDominator(SuccBB, PreHeader); + } + } + } } // Loop structure should be the following: diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index eb82ee283d44..012b10c8a9b0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -574,11 +574,9 @@ protected: /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(Loop *NewLoop); - /// Emit a bypass check to see if the trip count would overflow, or we - /// wouldn't have enough iterations to execute one vector loop. + /// Emit a bypass check to see if the vector trip count is zero, including if + /// it overflows. void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); - /// Emit a bypass check to see if the vector trip count is nonzero. - void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. 
void emitSCEVChecks(Loop *L, BasicBlock *Bypass); @@ -3289,37 +3287,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - Value *CheckMinIters = Builder.CreateICmpULT( - Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); + // Generate code to check if the loop's trip count is less than VF * UF, or + // equal to it in case a scalar epilogue is required; this implies that the + // vector trip count is zero. This check also covers the case where adding one + // to the backedge-taken count overflowed leading to an incorrect trip count + // of zero. In this case we will also jump to the scalar loop. + auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = - BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); -} - -void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, - BasicBlock *Bypass) { - Value *TC = getOrCreateVectorTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. 
- Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), - "cmp.zero"); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass @@ -3328,7 +3305,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, Cmp)); + BranchInst::Create(Bypass, NewBB, CheckMinIters)); LoopBypassBlocks.push_back(BB); } @@ -3477,14 +3454,13 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. - emitVectorLoopEnteredCheck(Lp, ScalarPH); + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(Lp, ScalarPH); @@ -3527,7 +3503,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { // We know what the end value is. 
EndValue = CountRoundDown; } else { - IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(CountRoundDown, true, StepType, true); @@ -4168,7 +4144,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // This is the vector-clone of the value that leaves the loop. Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4425043ad39a..dcbcab459a6b 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -434,7 +434,7 @@ private: /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. - Value *alreadyVectorized(ArrayRef VL) const; + Value *alreadyVectorized(ArrayRef VL, Value *OpValue) const; /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -857,7 +857,7 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP); + bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, Value *OpValue); /// Un-bundles a group of instructions. 
void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -1212,7 +1212,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that all of the users of the scalars that we want to vectorize are // schedulable. Instruction *VL0 = cast(VL[0]); - BasicBlock *BB = cast(VL0)->getParent(); + BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with @@ -1237,7 +1237,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this)) { + if (!BS.tryScheduleBundle(VL, this, VL0)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && @@ -2427,8 +2427,8 @@ Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { return Vec; } -Value *BoUpSLP::alreadyVectorized(ArrayRef VL) const { - if (const TreeEntry *En = getTreeEntry(VL[0])) { +Value *BoUpSLP::alreadyVectorized(ArrayRef VL, Value *OpValue) const { + if (const TreeEntry *En = getTreeEntry(OpValue)) { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -2553,7 +2553,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *InVec = vectorizeTree(INVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CastInst *CI = dyn_cast(VL0); @@ -2575,7 +2575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CmpInst::Predicate P0 = cast(VL0)->getPredicate(); @@ -2604,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; Value 
*V = Builder.CreateSelect(Cond, True, False); @@ -2644,7 +2644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; BinaryOperator *BinOp = cast(VL0); @@ -2806,7 +2806,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; // Create a vector of LHS op1 RHS @@ -3097,8 +3097,8 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP) { - if (isa(VL[0])) + BoUpSLP *SLP, Value *OpValue) { + if (isa(OpValue)) return true; // Initialize the instruction bundle. @@ -3106,7 +3106,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. 
@@ -3177,7 +3177,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, } } if (!Bundle->isReady()) { - cancelScheduling(VL, VL[0]); + cancelScheduling(VL, OpValue); return false; } return true; diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index 3e3eff39d637..f475878e2f2a 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -30,7 +30,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules" - "${LLVM_BINARY_DIR}/lib/cmake/llvm" + "${LLVM_LIBRARY_DIR}/cmake/llvm" ) # Some of the runtimes will conditionally use the compiler-rt sanitizers @@ -123,7 +123,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) set(LLVM_RUNTIME_OUTPUT_INTDIR "${LLVM_TOOLS_BINARY_DIR}/${LLVM_RUNTIMES_TARGET}") endif() endif() - + # Between each sub-project we want to cache and clear the LIT properties set_property(GLOBAL PROPERTY LLVM_LIT_TESTSUITES) set_property(GLOBAL PROPERTY LLVM_LIT_PARAMS) @@ -154,7 +154,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) if(LLVM_INCLUDE_TESTS) # Add a global check rule now that all subdirectories have been traversed # and we know the total set of lit testsuites. 
- + add_lit_target(check-runtimes "Running all regression tests" ${RUNTIMES_LIT_TESTSUITES} @@ -331,6 +331,7 @@ else() # if this is included from LLVM's CMake # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} + -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR} -DCMAKE_C_COMPILER_TARGET=${target} -DCMAKE_CXX_COMPILER_TARGET=${target} -DCMAKE_ASM_COMPILER_TARGET=${target} @@ -339,6 +340,7 @@ else() # if this is included from LLVM's CMake -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON ${${target}_extra_args} + TOOLCHAIN_TOOLS clang lld llvm-ar llvm-ranlib PASSTHROUGH_PREFIXES ${prefixes} EXTRA_TARGETS ${${target}_extra_targets} ${${target}_test_targets} @@ -376,6 +378,7 @@ else() # if this is included from LLVM's CMake # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} + -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR} PASSTHROUGH_PREFIXES ${prefixes} EXTRA_TARGETS ${extra_targets} ${test_targets} diff --git a/test/Analysis/CostModel/SystemZ/fp-arith.ll b/test/Analysis/CostModel/SystemZ/fp-arith.ll index 08a7c291138f..5f92db1ababf 100644 --- a/test/Analysis/CostModel/SystemZ/fp-arith.ll +++ b/test/Analysis/CostModel/SystemZ/fp-arith.ll @@ -1,4 +1,7 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z13 %s +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z14 %s ; ; Note: The scalarized vector instructions cost is not including any ; extracts, due to the undef operands @@ -21,13 +24,17 @@ define void @fadd() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fadd float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for 
instruction: %res1 = fadd double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fadd fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fadd <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fadd <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fadd <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fadd <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fadd <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fadd <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fadd <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fadd <16 x double> undef, undef ret void; @@ -49,13 +56,17 @@ define void @fsub() { ; CHECK: Cost Model: Found an estimated cost of 1 for 
instruction: %res0 = fsub float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fsub double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fsub fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fsub <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fsub <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fsub <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fsub <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fsub <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fsub <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fsub <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fsub <16 x double> undef, undef ret void; @@ 
-77,13 +88,17 @@ define void @fmul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fmul float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fmul double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fmul fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fmul <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fmul <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fmul <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fmul <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fmul <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fmul <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fmul <16 x float> undef, undef ; CHECK: Cost Model: Found an 
estimated cost of 8 for instruction: %res10 = fmul <16 x double> undef, undef ret void; @@ -105,13 +120,17 @@ define void @fdiv() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fdiv float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fdiv double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fdiv fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fdiv <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fdiv <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fdiv <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fdiv <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fdiv <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fdiv <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost 
of 4 for instruction: %res9 = fdiv <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fdiv <16 x double> undef, undef ret void; diff --git a/test/Assembler/diimportedentity.ll b/test/Assembler/diimportedentity.ll index bc85ca09f658..6a0e1eb931c1 100644 --- a/test/Assembler/diimportedentity.ll +++ b/test/Assembler/diimportedentity.ll @@ -18,9 +18,9 @@ ; CHECK: !3 = !DICompositeType({{.*}}) !3 = !DICompositeType(tag: DW_TAG_structure_type, name: "Class", size: 32, align: 32) -; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7) +; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, file: !2, line: 7) !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, - entity: !1, line: 7) + entity: !1, file: !2, line: 7) ; CHECK-NEXT: !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0) !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0) diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll index 31c3fda1b00a..3cf082472829 100644 --- a/test/Bitcode/DIGlobalVariableExpression.ll +++ b/test/Bitcode/DIGlobalVariableExpression.ll @@ -36,4 +36,4 @@ !9 = !{!"clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)"} !10 = distinct !DIGlobalVariable(name: "c", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !DIExpression(DW_OP_constu, 23, DW_OP_stack_value)) !11 = distinct !DIGlobalVariable(name: "h", scope: !1, file: !2, line: 2, type: !5, isLocal: false, isDefinition: true) -!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 1, scope: !1, entity: !11) +!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !2, line: 1, scope: !1, entity: !11) diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll index cf6c30e7c26c..e9313dfba870 100644 --- 
a/test/Bitcode/compatibility-3.6.ll +++ b/test/Bitcode/compatibility-3.6.ll @@ -368,9 +368,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll index 180dad258b68..82fc99055357 100644 --- a/test/Bitcode/compatibility-3.7.ll +++ b/test/Bitcode/compatibility-3.7.ll @@ -368,9 +368,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll index 370c7f51a2b7..2e70a380d10e 100644 --- a/test/Bitcode/compatibility-3.8.ll +++ b/test/Bitcode/compatibility-3.8.ll @@ -393,9 +393,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void 
@f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll index 4115cbd8fe64..7c84daa7d3c4 100644 --- a/test/Bitcode/compatibility-3.9.ll +++ b/test/Bitcode/compatibility-3.9.ll @@ -422,9 +422,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll index eef925564ecb..9e34d48c95f7 100644 --- a/test/Bitcode/compatibility-4.0.ll +++ b/test/Bitcode/compatibility-4.0.ll @@ -422,9 +422,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll index ebd727ba9aee..7df1535a6923 100644 --- 
a/test/Bitcode/compatibility.ll +++ b/test/Bitcode/compatibility.ll @@ -425,9 +425,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.win64cc() +; CHECK: declare win64cc void @f.win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/upgrade-importedentity.ll b/test/Bitcode/upgrade-importedentity.ll new file mode 100644 index 000000000000..134ccf1f3eaf --- /dev/null +++ b/test/Bitcode/upgrade-importedentity.ll @@ -0,0 +1,15 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s +; RUN: verify-uselistorder < %s.bc + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9, !10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)", emissionKind: FullDebug, imports: !3) +!1 = !DIFile(filename: "using.ii", directory: "/") +!3 = !{!4} +!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !8, line: 301) +; CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5) +!5 = !DINamespace(name: "M", scope: null) +!8 = !DINamespace(name: "N", scope: null) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/Bitcode/upgrade-importedentity.ll.bc b/test/Bitcode/upgrade-importedentity.ll.bc new file mode 100644 index 000000000000..7fa833b50462 Binary files /dev/null and b/test/Bitcode/upgrade-importedentity.ll.bc differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b52b6018e026..124f0c72fd75 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,6 +48,7 @@ 
set(LLVM_TEST_DEPENDS llvm-cvtres llvm-diff llvm-dis + llvm-dlltool llvm-dsymutil llvm-dwarfdump llvm-dwp @@ -58,6 +59,7 @@ set(LLVM_TEST_DEPENDS llvm-mc llvm-mcmarkup llvm-modextract + llvm-mt llvm-nm llvm-objdump llvm-opt-report @@ -65,6 +67,7 @@ set(LLVM_TEST_DEPENDS llvm-profdata llvm-ranlib llvm-readobj + llvm-readelf llvm-rtdyld llvm-size llvm-split diff --git a/test/CodeGen/AArch64/GlobalISel/select-fma.mir b/test/CodeGen/AArch64/GlobalISel/select-fma.mir new file mode 100644 index 000000000000..3b2f3746b587 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-fma.mir @@ -0,0 +1,41 @@ +# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @FMADDSrrr_fpr() { ret void } +... + +--- +# CHECK-LABEL: name: FMADDSrrr_fpr +name: FMADDSrrr_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr32, preferred-register: '' } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = COPY %w2 +# CHECK: %3 = FMADDSrrr %0, %1, %2 +body: | + bb.0: + liveins: %w0, %w1, %w2 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = COPY %w2 + %3(s32) = G_FMA %0, %1, %2 + %x0 = COPY %3 +... + diff --git a/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll new file mode 100644 index 000000000000..2546e7c90ce5 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define win64cc void @pass_va(i32 %count, ...) 
nounwind { +entry: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: str x1, [sp, #24] +; CHECK: stp x30, x8, [sp] +; CHECK: bl other_func +; CHECK: ldr x30, [sp], #80 +; CHECK: ret + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + call void @other_func(i8* %ap2) + ret void +} + +declare void @other_func(i8*) local_unnamed_addr + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_copy(i8*, i8*) nounwind + +; CHECK-LABEL: f9: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define win64cc i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f8: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #16 +; CHECK: add x0, sp, #16 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define win64cc i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f7: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #8 +; CHECK: add x0, sp, #8 +; CHECK: stp x8, x7, [sp], #16 +; CHECK: ret +define win64cc i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) 
nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll index 0a7965571480..64a6b9b6b210 100644 --- a/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -11,9 +11,8 @@ define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, ; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 ; First vararg ; CHECK: ldr {{w[0-9]+}}, [sp, #72] -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Second vararg -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Third vararg ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index b2ea9ad3b4a1..b844aab5628c 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -280,10 +280,10 @@ entry: define i32 @caller42() #3 { entry: ; CHECK-LABEL: caller42 -; CHECK: str {{x[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] -; CHECK: str {{x[0-9]+}}, [sp, #16] -; CHECK: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] ; CHECK: add x1, sp, #32 ; CHECK: mov x2, sp ; Space for s1 is allocated at sp+32 @@ -318,10 +318,10 @@ entry: ; CHECK-LABEL: caller42_stack ; CHECK: sub sp, sp, #112 ; CHECK: add x29, sp, #96 -; CHECK: stur {{x[0-9]+}}, [x29, #-16] -; CHECK: stur {{q[0-9]+}}, [x29, #-32] -; CHECK: str {{x[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: stur {{x[0-9]+}}, [x29, #-16] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] ; Space for s1 is allocated 
at x29-32 = sp+64 ; Space for s2 is allocated at sp+32 ; CHECK: add x[[B:[0-9]+]], sp, #32 @@ -388,10 +388,10 @@ entry: define i32 @caller43() #3 { entry: ; CHECK-LABEL: caller43 -; CHECK: str {{q[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] -; CHECK: str {{q[0-9]+}}, [sp, #16] -; CHECK: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] ; CHECK: add x1, sp, #32 ; CHECK: mov x2, sp ; Space for s1 is allocated at sp+32 @@ -430,10 +430,10 @@ entry: ; CHECK-LABEL: caller43_stack ; CHECK: sub sp, sp, #112 ; CHECK: add x29, sp, #96 -; CHECK: stur {{q[0-9]+}}, [x29, #-16] -; CHECK: stur {{q[0-9]+}}, [x29, #-32] -; CHECK: str {{q[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-16] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] ; Space for s1 is allocated at x29-32 = sp+64 ; Space for s2 is allocated at sp+32 ; CHECK: add x[[B:[0-9]+]], sp, #32 diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll index a3b740df9b4e..fdb379871048 100644 --- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,10 +1,8 @@ ; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s ; CHECK: foo -; CHECK: str w[[REG0:[0-9]+]], [x19, #264] -; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]] -; CHECK: str w[[REG1]], [x19, #132] - +; CHECK-DAG: str w[[REG0:[0-9]+]], [x19, #132] +; CHECK-DAG: str w[[REG0]], [x19, #264] define i32 @foo(i32 %a) nounwind { %retval = alloca i32, align 4 %a.addr = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll index 990782cb69a0..c98bda0d01a0 100644 --- 
a/test/CodeGen/AArch64/arm64-extern-weak.ll +++ b/test/CodeGen/AArch64/arm64-extern-weak.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s ; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index f849df2a51ec..848b87fd2cfb 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -261,3 +261,13 @@ define void @test_inline_modifier_a(i8* %ptr) nounwind { ; CHECK: prfm pldl1keep, [x0] ret void } + +; PR33134 +define void @test_zero_address() { +entry: +; CHECK-LABEL: test_zero_address +; CHECK: mov {{x[0-9]+}}, xzr +; CHECK: ldr {{x[0-9]+}}, {{[x[0-9]+]}} + tail call i32 asm sideeffect "ldr $0, $1 \0A", "=r,*Q"(i32* null) + ret void +} diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll index f3af01a73559..9b5d8a890fa6 100644 --- a/test/CodeGen/AArch64/arm64-platform-reg.ll +++ b/test/CodeGen/AArch64/arm64-platform-reg.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-windows -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; x18 is reserved as a platform register on Darwin but not on other ; systems. Create loads of register pressure and make sure this is respected. 
diff --git a/test/CodeGen/AArch64/arm64-vext.ll b/test/CodeGen/AArch64/arm64-vext.ll index b315e4c409b0..c1edf1b2e9bf 100644 --- a/test/CodeGen/AArch64/arm64-vext.ll +++ b/test/CodeGen/AArch64/arm64-vext.ll @@ -116,7 +116,7 @@ define void @test_vext_p16() nounwind ssp { define void @test_vext_s32() nounwind ssp { ; CHECK-LABEL: test_vext_s32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xS32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -137,7 +137,7 @@ define void @test_vext_s32() nounwind ssp { define void @test_vext_u32() nounwind ssp { ; CHECK-LABEL: test_vext_u32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xU32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -158,7 +158,7 @@ define void @test_vext_u32() nounwind ssp { define void @test_vext_f32() nounwind ssp { ; CHECK-LABEL: test_vext_f32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xF32x2 = alloca <2 x float>, align 8 %__a = alloca <2 x float>, align 8 %__b = alloca <2 x float>, align 8 @@ -179,7 +179,7 @@ define void @test_vext_f32() nounwind ssp { define void @test_vext_s64() nounwind ssp { ; CHECK-LABEL: test_vext_s64: - ; CHECK_FIXME: {{ext.8.*#1}} + ; CHECK_FIXME: {{rev64.2s.*}} ; this just turns into a load of the second element %xS64x1 = alloca <1 x i64>, align 8 %__a = alloca <1 x i64>, align 8 diff --git a/test/CodeGen/AArch64/atomic-ops-lse.ll b/test/CodeGen/AArch64/atomic-ops-lse.ll index a85eb6b46aff..a0c418bff573 100644 --- a/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -681,3 +681,164 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ret i64 %old } +define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i8: + %old = atomicrmw sub i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; 
CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i8 %old +} + +define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i16: + %old = atomicrmw sub i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i16 %old +} + +define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32: + %old = atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i32 %old +} + +define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64: + %old = atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i64 %old +} + +define void @test_atomic_load_sub_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32_noret: + atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define void @test_atomic_load_sub_i64_noret(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64_noret: + 
atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i8: + %old = atomicrmw and i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i8 %old +} + +define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i16: + %old = atomicrmw and i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i16 %old +} + +define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32: + %old = atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i32 %old +} + +define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i64: + %old = atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret 
i64 %old +} + +define void @test_atomic_load_and_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32_noret: + atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} + +define void @test_atomic_load_and_i64_noret(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i64_noret: + atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll index 20ba3fea8377..a2fa1db8a8ac 100644 --- a/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -9,7 +9,7 @@ main_: %i32T = alloca i32, align 4 %i32F = alloca i32, align 4 %i32X = alloca i32, align 4 - store i32 0, i32* %tmp + store i32 %argc, i32* %tmp store i32 15, i32* %i32T, align 4 store i32 5, i32* %i32F, align 4 %tmp6 = load i32, i32* %tmp, align 4 diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index ac2153ad8ffe..5671a1070138 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck 
--check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.ll b/test/CodeGen/AArch64/falkor-hwpf-fix.ll new file mode 100644 index 000000000000..9f2af5adce71 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -mtriple aarch64 -mcpu=falkor -disable-post-ra | FileCheck %s + +; Check that strided load tag collisions are avoided on Falkor. + +; CHECK-LABEL: hwpf1: +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE:[0-9]+]], #-16] +; CHECK: mov x[[BASE2:[0-9]+]], x[[BASE]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE2]], #-8] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE3:[0-9]+]]] +; CHECK: mov x[[BASE4:[0-9]+]], x[[BASE3]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE4]], #8] + +define void @hwpf1(i32* %p, i32* %sp, i32* %sp2, i32* %sp3, i32* %sp4) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load1 = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %gep, i32 1 + %load2 = load i32, i32* %gep2 + + %add = add i32 %load1, %load2 + %storegep = getelementptr inbounds i32, i32* %sp, i32 %iv + store i32 %add, i32* %storegep + + %gep3 = getelementptr inbounds i32, i32* %gep, i32 2 + %load3 = load i32, i32* %gep3 + + %gep4 = getelementptr inbounds i32, i32* %gep, i32 3 + %load4 = load i32, i32* %gep4 + + %add2 = add i32 %load3, %load4 + %storegep2 = getelementptr inbounds i32, i32* %sp2, i32 %iv + store i32 %add2, i32* %storegep2 + + %gep5 = getelementptr inbounds i32, i32* %gep, i32 4 + %load5 = load i32, i32* %gep5 + + %gep6 = getelementptr inbounds i32, i32* %gep, i32 5 + %load6 = load i32, i32* %gep6 + + %add3 = add i32 %load5, %load6 + %storegep3 = getelementptr inbounds i32, i32* %sp3, i32 %iv + store i32 %add3, i32* %storegep3 + + %gep7 = getelementptr inbounds i32, i32* %gep, i32 6 + %load7 = load i32, i32* %gep7 + + %gep8 = getelementptr inbounds i32, i32* 
%gep, i32 7 + %load8 = load i32, i32* %gep8 + + %add4 = add i32 %load7, %load8 + %storegep4 = getelementptr inbounds i32, i32* %sp4, i32 %iv + store i32 %add4, i32* %storegep4 + + %inc = add i32 %iv, 8 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.mir b/test/CodeGen/AArch64/falkor-hwpf-fix.mir new file mode 100644 index 000000000000..54c8b16a9b43 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s +--- | + @g = external global i32 + + define void @hwpf1() { ret void } + define void @hwpf2() { ret void } +... +--- +# Verify that the tag collision between the loads is resolved. +# CHECK-LABEL: name: hwpf1 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWui %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf1 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %w2 = LDRWui %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... +--- +# Verify that the tag collision between the loads is resolved and written back for post increment addressing. +# CHECK-LABEL: name: hwpf2 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWpost %[[BASE]], 0 +# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf2 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %x1, %w2 = LDRWpost %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... 
diff --git a/test/CodeGen/AArch64/falkor-hwpf.ll b/test/CodeGen/AArch64/falkor-hwpf.ll new file mode 100644 index 000000000000..bbe7febe397f --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf.ll @@ -0,0 +1,106 @@ +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF + +; Check that strided access metadata is added to loads in inner loops when compiling for Falkor. + +; CHECK-LABEL: @hwpf1( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2, !falkor.strided.access !0 + +; NOHWPF-LABEL: @hwpf1( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf1(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; Check that outer loop strided load isn't marked. 
+; CHECK-LABEL: @hwpf2( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf2( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf2(i32* %p) { +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ] + br label %loop2.header + +loop2.header: + br label %loop2 + +loop2: + %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] + %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ] + %gep = getelementptr inbounds i32, i32* %p, i32 %iv2 + %load = load i32, i32* %gep + %sum.inc = add i32 %sum, %load + %inc2 = add i32 %iv2, 1 + %exitcnd2 = icmp uge i32 %inc2, 1024 + br i1 %exitcnd2, label %exit2, label %loop2 + +exit2: + %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1 + %load2 = load i32, i32* %gep2 + br label %loop1.latch + +loop1.latch: + %inc1 = add i32 %iv1, 1 + %exitcnd1 = icmp uge i32 %inc1, 1024 + br i1 %exitcnd1, label %exit, label %loop1 + +exit: + ret void +} + + +; Check that non-strided load isn't marked. 
+; CHECK-LABEL: @hwpf3( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf3( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf3(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %load + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll index 88e6f5dd01c9..386a6ecccf54 100644 --- a/test/CodeGen/AArch64/preferred-function-alignment.ll +++ b/test/CodeGen/AArch64/preferred-function-alignment.ll @@ -1,7 +1,6 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s -; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s @@ -12,6 +11,7 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s +; RUN: llc -mtriple=aarch64-unknown-linux 
-mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index bc28f477c810..bcad19e391d0 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -309,17 +309,17 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE-LABEL: foo_vararg: ; CHECK-APPLE: orr w0, wzr, #0x10 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 -; CHECK-APPLE: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 -; CHECK-APPLE: strb [[ID]], [x0, #8] +; CHECK-APPLE-DAG: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 +; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg ; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8 ; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Second vararg -; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 ; Third vararg ; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll new file mode 100644 index 000000000000..b760e4acd16a --- /dev/null +++ b/test/CodeGen/AArch64/win64_vararg.ll @@ -0,0 +1,95 @@ +; RUN: llc < %s -mtriple=aarch64-pc-win32 | FileCheck %s + +define void @pass_va(i32 %count, ...) 
nounwind { +entry: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: str x1, [sp, #24] +; CHECK: stp x30, x8, [sp] +; CHECK: bl other_func +; CHECK: ldr x30, [sp], #80 +; CHECK: ret + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + call void @other_func(i8* %ap2) + ret void +} + +declare void @other_func(i8*) local_unnamed_addr + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_copy(i8*, i8*) nounwind + +; CHECK-LABEL: f9: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f8: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #16 +; CHECK: add x0, sp, #16 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f7: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #8 +; CHECK: add x0, sp, #8 +; CHECK: stp x8, x7, [sp], #16 +; CHECK: ret +define i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) 
nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: copy1: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: stp x8, x1, [sp, #16] +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #80 +; CHECK: ret +define void @copy1(i64 %a0, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %cp = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + %cp1 = bitcast i8** %cp to i8* + call void @llvm.va_start(i8* %ap1) + call void @llvm.va_copy(i8* %cp1, i8* %ap1) + ret void +} diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll new file mode 100644 index 000000000000..e9797eff712b --- /dev/null +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -0,0 +1,312 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=HSA %s + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 + +; HSA: define void @use_workitem_id_x() #1 { +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_y() #2 { +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() 
+ store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_z() #3 { +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_x() #4 { +define void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y() #5 { +define void @use_workgroup_id_y() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_z() #6 { +define void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_ptr() #7 { +define void @use_dispatch_ptr() #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_queue_ptr() #8 { +define void @use_queue_ptr() #1 { + %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() + store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_id() #9 { +define void @use_dispatch_id() #1 { + %val = call i64 @llvm.amdgcn.dispatch.id() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y_workgroup_id_z() #10 { +define void @use_workgroup_id_y_workgroup_id_z() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_x() #1 { +define void @func_indirect_use_workitem_id_x() 
#1 { + call void @use_workitem_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workitem_id_x() #1 { +define void @kernel_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_y() #2 { +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_z() #3 { +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_x() #4 { +define void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workgroup_id_x() #4 { +define void @kernel_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_z() #6 { +define void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; HSA: define void @func_indirect_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_indirect_use_workgroup_id_y() #1 { + call void @func_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @indirect_x2_use_workgroup_id_y() #5 { +define void @indirect_x2_use_workgroup_id_y() #1 { + call void @func_indirect_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_ptr() #7 { +define void @func_indirect_use_dispatch_ptr() #1 { + call void @use_dispatch_ptr() + ret void +} + +; HSA: define void @func_indirect_use_queue_ptr() #8 { +define void @func_indirect_use_queue_ptr() #1 { + call void @use_queue_ptr() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_id() #9 { +define void 
@func_indirect_use_dispatch_id() #1 { + call void @use_dispatch_id() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #11 { +define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { + call void @func_indirect_use_workgroup_id_y_workgroup_id_z() + ret void +} + +; HSA: define void @recursive_use_workitem_id_y() #2 { +define void @recursive_use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @call_recursive_use_workitem_id_y() #2 { +define void @call_recursive_use_workitem_id_y() #1 { + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 { +define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { +define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 { +define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + call void @func_indirect_use_queue_ptr() + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast() #8 { +define void @indirect_use_group_to_flat_addrspacecast() #1 { + call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #11 { +define 
void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #8 { +define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @use_kernarg_segment_ptr() #14 { +define void @use_kernarg_segment_ptr() #1 { + %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 { +define void @func_indirect_use_kernarg_segment_ptr() #1 { + call void @use_kernarg_segment_ptr() + ret void +} + +; HSA: define void @use_implicitarg_ptr() #14 { +define void @use_implicitarg_ptr() #1 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_implicitarg_ptr() #14 { +define void @func_indirect_use_implicitarg_ptr() #1 { + call void @use_implicitarg_ptr() + ret void +} + +; HSA: declare void @external.func() #15 +declare void @external.func() #3 + +; HSA: define internal void @defined.func() #15 { +define internal void @defined.func() #3 { + ret void +} + +; HSA: define void @func_call_external() #15 { +define void @func_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define void @func_call_defined() #15 { +define void @func_call_defined() #3 { + call void @defined.func() + ret void +} + +; HSA: define void @func_call_asm() #15 { +define void @func_call_asm() #3 { + call void asm sideeffect "", ""() #3 + ret void +} + +; HSA: define amdgpu_kernel void @kern_call_external() #16 { +define amdgpu_kernel void 
@kern_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define amdgpu_kernel void @func_kern_defined() #16 { +define amdgpu_kernel void @func_kern_defined() #3 { + call void @defined.func() + ret void +} + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind "target-cpu"="fiji" } +attributes #2 = { nounwind "target-cpu"="gfx900" } +attributes #3 = { nounwind } + +; HSA: attributes #0 = { nounwind readnone speculatable } +; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" } +; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" } +; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" } +; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" } +; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" } +; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" } +; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" } +; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #11 = { nounwind "target-cpu"="fiji" } +; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } +; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } +; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } +; HSA: attributes #15 = { nounwind } +; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" } diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index f7461b925ca1..3059a95a5098 100644 --- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -10,6 +10,7 @@ declare i32 
@llvm.amdgcn.workitem.id.z() #0 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { @@ -164,6 +165,15 @@ define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { ret void } +; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 { +define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* + %val = load i32, i32 addrspace(2)* %bc + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* @@ -236,3 +246,4 @@ attributes #1 = { nounwind } ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" } +; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" } diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 63a6f6a8d32c..a0694fb1e3c9 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -36,7 +36,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_2048 ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 7 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; 
CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_2048() #3 { diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 3dda73bc336e..a5e97205de21 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -118,7 +118,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; CHECK-LABEL: {{^}}exactly_10: ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float addrspace(1)* @var @@ -188,3 +188,15 @@ define amdgpu_kernel void @exactly_10() #9 { ret void } attributes #9 = {"amdgpu-waves-per-eu"="10,10"} + +; Exactly 256 workitems and exactly 2 waves. +; CHECK-LABEL: {{^}}empty_workitems_exactly_256_waves_exactly_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_workitems_exactly_256_waves_exactly_2() #10 { +entry: + ret void +} +attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"} diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 5383bbe71ae3..5ffa45595e70 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void 
@test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] -; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: -; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -465,6 +471,49 @@ entry: ret float %canonicalized } +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 +; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]], +; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-DENORM-NOT: 1.0 +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void 
@test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 +; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %v = load double, double addrspace(1)* %gep, align 8 + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 +; GCN: flat_load_ushort [[V:v[0-9]+]], +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %v = load half, half addrspace(1)* %gep, align 2 + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + declare float @llvm.canonicalize.f32(float) #0 declare double @llvm.canonicalize.f64(double) #0 declare 
half @llvm.canonicalize.f16(half) #0 @@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0 declare double @llvm.maxnum.f64(double, double) #0 attributes #0 = { nounwind readnone } +attributes #1 = { "no-nans-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll index 9b1368493ba5..6b22cb0b7e28 100644 --- a/test/CodeGen/AMDGPU/function-args.ll +++ b/test/CodeGen/AMDGPU/function-args.ll @@ -34,6 +34,22 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 { ret void } +; GCN-LABEL: {{^}}i1_arg_i1_use: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1 +define void @i1_arg_i1_use(i1 %arg) #0 { +bb: + br i1 %arg, label %bb2, label %bb1 + +bb1: + store volatile i32 0, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + ; GCN-LABEL: {{^}}void_func_i8: ; GCN-NOT: v0 ; GCN: buffer_store_byte v0, off diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 972fbd66ef37..0b19fbe7d70c 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -40,7 +40,7 @@ ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .amdgpu_hsa_kernel simple +; HSA-LABEL: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -65,3 +65,11 @@ entry: store i32 0, i32 addrspace(1)* %out ret void } + +; HSA-LABEL: .amdgpu_hsa_kernel simple_no_kernargs +; HSA: enable_sgpr_kernarg_segment_ptr = 0 +define amdgpu_kernel void @simple_no_kernargs() { +entry: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 9a27809f37bb..70e6b408ca29 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ 
-49,6 +49,18 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x ret void } +; ALL-LABEL: {{^}}test_no_kernargs: +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: s_load_dword s{{[0-9]+}}, s[4:5] +define amdgpu_kernel void @test_no_kernargs() #1 { + %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* + %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(2)* %gep + store volatile i32 %value, i32 addrspace(1)* undef + ret void +} + declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index f0af876567b4..1c3cba8d3e4f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index ee58d359a935..a466671d8c55 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: image_store diff --git a/test/CodeGen/AMDGPU/move-to-valu-worklist.ll b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll new file 
mode 100644 index 000000000000..539eed92d540 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s + +; In moveToVALU(), move to vector ALU is performed, all instrs in +; the use chain will be visited. We do not want the same node to be +; pushed to the visit worklist more than once. + +; GCN-LABEL: {{^}}in_worklist_once: +; GCN: buffer_load_dword +; GCN: BB0_1: +; GCN: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @in_worklist_once() #0 { +bb: + %tmp = load i64, i64* undef +br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = phi i64 [ undef, %bb ], [ %tmp16, %bb1 ] + %tmp3 = phi i64 [ %tmp, %bb ], [ undef, %bb1 ] + %tmp11 = shl i64 %tmp2, 14 + %tmp13 = xor i64 %tmp11, %tmp2 + %tmp15 = and i64 %tmp3, %tmp13 + %tmp16 = xor i64 %tmp15, %tmp3 +br label %bb1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 3a0605fa182a..742c4f8af85d 100644 --- a/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,42 +5,42 @@ ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, i8* inttoptr (i32 8 to i8*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, i16* inttoptr (i32 8 to i16*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, i8* inttoptr (i32 8 to i8*) ret void @@ -65,7 +65,7 @@ define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 } ; GCN-LABEL: 
{{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, i16* inttoptr (i32 8 to i16*) ret void @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) # } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, i8* inttoptr (i32 4095 to i8*) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}} +; GCN: buffer_store_byte 
v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, i8* inttoptr (i32 4096 to i8*) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, i8* inttoptr (i32 4097 to i8*) ret void diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index 190d2b72ebaf..87f37144244e 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: OR_INT ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. define amdgpu_kernel void @_Z9chk1D_512v() #0 { diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll index 91116b0f65ea..e199d5b5df25 100644 --- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -5,7 +5,7 @@ ; then merge if-regions with the same bodies. ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. 
; XFAIL: * ; diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index dcb089010e99..cf0c7944d4cd 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -10,14 +10,14 @@ ; GCN-LABEL: {{^}}store_to_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; -should be used directly without any copies. ; OPTNONE-NOT: s_mov_b32 -; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void @@ -26,7 +26,7 @@ define amdgpu_kernel void @store_to_undef() #0 { ; GCN-LABEL: {{^}}store_to_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 124 to i32*) @@ -36,7 +36,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 { ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword 
v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef @@ -46,7 +46,7 @@ define amdgpu_kernel void @load_from_undef() #0 { ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 770bfaddb23e..a52b80ba86e5 100644 --- a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -34,7 +34,7 @@ body: | bb.0: successors: %bb.2, %bb.1 - %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, 0, implicit %exec + %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, implicit %exec %vcc = COPY killed %7 S_CBRANCH_VCCZ %bb.2, implicit killed %vcc diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll index 6ed730ad60f4..5e0178072e5e 100644 --- a/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx804 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN 
--check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s ; This used to fail due to a v_add_i32 instruction with an illegal immediate @@ -8,15 +8,16 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -25,9 +26,10 @@ define amdgpu_ps float @ps_main(i32 %idx) { } ; GCN-LABEL: {{^}}vs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -36,9 +38,9 @@ define amdgpu_vs float @vs_main(i32 %idx) { } ; GCN-LABEL: {{^}}cs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword 
{{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -47,10 +49,15 @@ define amdgpu_cs float @cs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NOT: s_mov_b32 s0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -59,10 +66,13 @@ define amdgpu_hs float @hs_main(i32 %idx) { } ; GCN-LABEL: {{^}}gs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -71,10 +81,16 @@ define amdgpu_gs float @gs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI-NOT: s_mov_b32 s6 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx @@ -86,10 +102,14 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx diff --git a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index 4f5c582f8b58..ff1b2ad73ef0 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -332,7 +332,7 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec -# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %exec, implicit %exec +# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, implicit-def %exec, implicit %exec # VI: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %3, 0, 6, 4, implicit-def %vcc, implicit %exec # VI: %{{[0-9]+}} = V_CMPX_EQ_I32_e64 23, killed %{{[0-9]+}}, implicit-def %exec, implicit %exec @@ -345,20 +345,21 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def 
%vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 0, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def %exec, implicit %exec + name: vopc_instructions @@ -415,28 +416,28 @@ body: | V_CMPX_EQ_I32_e32 123, killed %13, implicit-def %vcc, implicit-def %exec, implicit %exec %14 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, implicit %exec %15 = V_AND_B32_e64 %5, %3, implicit %exec - %18 = V_CMPX_GT_F32_e64 0, 23, 0, 
killed %15, 0, 0, implicit-def %exec, implicit %exec + %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, implicit-def %exec, implicit %exec %16 = V_AND_B32_e64 %5, %3, implicit %exec %vcc = V_CMP_LT_I32_e64 %6, killed %16, implicit %exec %17 = V_AND_B32_e64 %5, %3, implicit %exec %19 = V_CMPX_EQ_I32_e64 23, killed %17, implicit-def %exec, implicit %exec %20 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, implicit %exec %21 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, implicit-def %exec, implicit %exec %23 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, 2, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, implicit %exec %24 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, implicit-def %exec, implicit %exec %25 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, implicit-def %exec, implicit %exec %26 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, implicit-def %exec, implicit %exec %27 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, implicit-def %exec, implicit %exec %100 = V_MOV_B32_e32 %vcc_lo, implicit %exec diff --git a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir index 913b54332119..bd222adf6a68 100644 --- 
a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir +++ b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -8,7 +8,7 @@ # GCN: %{{[0-9]+}} = V_BCNT_U32_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_BFM_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec -# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_READLANE_B32 killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec --- @@ -50,7 +50,7 @@ body: | %15 = V_BFM_B32_e64 %13, killed %14, implicit-def %vcc, implicit %exec %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec - %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, 0, implicit-def %vcc, implicit %exec + %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, implicit-def %vcc, implicit %exec %18 = V_LSHRREV_B32_e64 16, %17, implicit %exec %19 = V_READLANE_B32 killed %18, 0, implicit-def %vcc, implicit %exec diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 51771c9723e0..04ff4c87ea77 100644 --- a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -19,11 +19,11 @@ declare void @llvm.debugtrap() #0 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_trap: ; HSA-TRAP: enable_trap_handler = 1 @@ -45,11 +45,11 @@ define amdgpu_kernel void @hsa_trap() { ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-WARNING: warning: :0:0: in 
function hsa_debugtrap void (): debugtrap handler not supported ; GCN-LABEL: {{^}}hsa_debugtrap: diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 6eb937e71b1b..54991d3d953c 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -81,7 +81,7 @@ body: | %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) %sgpr7 = S_MOV_B32 61440 %sgpr6 = S_MOV_B32 -1 - %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, implicit %exec S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc bb.2.if: diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 135f02ac205a..feae5e9f3792 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -19,8 +19,9 @@ ; HSA: workitem_private_segment_byte_size = 1536 ; GCN-NOT: flat_scr +; MESA-NOT: s_mov_b32 s3 +; HSA-NOT: s_mov_b32 s7 -; GCNMESA-DAG: s_mov_b32 s16, s3 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-DAG: s_mov_b32 s14, -1 @@ -29,17 +30,32 @@ ; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword 
{{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} + +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} + + + +; HSA: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} + +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index ca2366a361fb..afbd06a00fae 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 
s[[OFFREG:[0-9]+]], s12 +; GCN-NOT: s_mov_b32 s12 ; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1 @@ -22,8 +22,8 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 6a1da0dfe85f..0e3ef479bc3c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -45,6 +45,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_br() { ret void } + define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } @@ -1173,6 +1175,43 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... 
--- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + ; CHECK: bb.0 + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + ; CHECK: [[COND:%[0-9]+]] = COPY %r0 + + G_BRCOND %0(s1), %bb.1 + ; CHECK: TSTri [[COND]], 1, 14, _, implicit-def %cpsr + ; CHECK: Bcc %bb.1, 0, %cpsr + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.1: + ; CHECK: bb.1 + successors: %bb.2(0x80000000) + + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.2: + ; CHECK: bb.2 + + BX_RET 14, _ + ; CHECK: BX_RET 14, _ +... +--- name: test_soft_fp_double # CHECK-LABEL: name: test_soft_fp_double legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll index c778caacd0f4..c2e8c5abca4e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll @@ -87,3 +87,55 @@ define arm_aapcscc i32 @test_urem_i32(i32 %x, i32 %y) { %r = urem i32 %x, %y ret i32 %r } + +define arm_aapcscc i16 @test_srem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_srem_i16: +; CHECK-DAG: sxth r0, r0 +; CHECK-DAG: sxth r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i16 @test_urem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_urem_i16: +; CHECK-DAG: uxth r0, r0 +; CHECK-DAG: uxth r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i8 @test_srem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_i8: +; CHECK-DAG: sxtb r0, r0 +; CHECK-DAG: sxtb r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], 
[[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i8 %x, %y + ret i8 %r +} + +define arm_aapcscc i8 @test_urem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_i8: +; CHECK-DAG: uxtb r0, r0 +; CHECK-DAG: uxtb r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i8 %x, %y + ret i8 %r +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index 4c498ff6ca9b..419bcf71c106 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -420,3 +420,42 @@ entry: %r = select i1 %cond, i32* %a, i32* %b ret i32* %r } + +define arm_aapcscc void @test_br() { +; CHECK-LABEL: test_br +; CHECK: [[LABEL:.L[[:alnum:]_]+]]: +; CHECK: b [[LABEL]] +entry: + br label %infinite + +infinite: + br label %infinite +} + +declare arm_aapcscc void @brcond1() +declare arm_aapcscc void @brcond2() + +define arm_aapcscc void @test_brcond(i32 %n) { +; CHECK-LABEL: test_brcond +; CHECK: cmp r0 +; CHECK-NEXT: movgt [[RCMP:r[0-9]+]], #1 +; CHECK: tst [[RCMP]], #1 +; CHECK-NEXT: bne [[FALSE:.L[[:alnum:]_]+]] +; CHECK: blx brcond1 +; CHECK: [[FALSE]]: +; CHECK: blx brcond2 +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.true, label %if.false + +if.true: + call arm_aapcscc void @brcond1() + br label %if.end + +if.false: + call arm_aapcscc void @brcond2() + br label %if.end + +if.end: + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir index 9a0877846fc3..f436c3774c86 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -14,6 +14,12 @@ define void @test_srem_i32() { ret void } define void @test_urem_i32() { ret void } + + define void @test_srem_i16() { ret void } + 
define void @test_urem_i16() { ret void } + + define void @test_srem_i8() { ret void } + define void @test_urem_i8() { ret void } ... --- name: test_sdiv_i32 @@ -323,3 +329,171 @@ body: | %r0 = COPY %2(s32) BX_RET 14, _, implicit %r0 ... +--- +name: test_srem_i16 +# CHECK-LABEL: name: test_srem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s16) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_urem_i16 +# CHECK-LABEL: name: test_urem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s16) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_srem_i8 +# CHECK-LABEL: name: test_srem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s8) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_urem_i8 +# CHECK-LABEL: name: test_urem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s8) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 4575341dfc29..616f29d3b068 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -42,6 +42,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_brcond() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -863,6 +865,40 @@ body: | BX_RET 14, _, implicit %r0 ... 
--- +name: test_brcond +# CHECK-LABEL: name: test_brcond +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + G_BRCOND %2(s1), %bb.1 + ; G_BRCOND with s1 is legal, so we should find it unchanged in the output + ; CHECK: G_BRCOND {{%[0-9]+}}(s1), %bb.1 + G_BR %bb.2 + + bb.1: + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 + + bb.2: + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 + +... +--- name: test_fadd_s32 # CHECK-LABEL: name: test_fadd_s32 legalized: false diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index ffca431d96ea..638c6e620926 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -40,6 +40,8 @@ define void @test_select_s32() { ret void } + define void @test_br() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -828,6 +830,34 @@ body: | %r0 = COPY %3(s32) BX_RET 14, _, implicit %r0 +... +--- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: false +# CHECK: regBankSelected: true +selected: false +registers: + - { id: 0, class: _ } +# CHECK: { id: 0, class: gprb, preferred-register: '' } +# Check that we map the condition of the G_BRCOND into the GPR. +# For the G_BR, there are no registers to map, but make sure we don't crash. +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + G_BRCOND %0(s1), %bb.1 + G_BR %bb.2 + + bb.1: + BX_RET 14, _ + + bb.2: + BX_RET 14, _ + ... 
--- name: test_fadd_s32 diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index 23c4ccea4604..644a7fbf8d9a 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -26,6 +26,7 @@ entry: store i32 3855, i32* %xort store i32 4, i32* %temp %tmp = load i32, i32* %temp + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -35,6 +36,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %0 = atomicrmw add i32* %val1, i32 %tmp monotonic store i32 %0, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -44,6 +46,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %1 = atomicrmw sub i32* %val2, i32 30 monotonic store i32 %1, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -53,6 +56,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %2 = atomicrmw add i32* %val2, i32 1 monotonic store i32 %2, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -62,6 +66,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %3 = atomicrmw sub i32* %val2, i32 1 monotonic store i32 %3, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: and ; CHECK: strex @@ -71,6 +76,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %4 = atomicrmw and i32* %andt, i32 4080 monotonic store i32 %4, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: or ; CHECK: strex @@ -80,6 +86,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %5 = atomicrmw or i32* %ort, i32 4080 monotonic store i32 %5, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: eor ; CHECK: strex @@ -89,6 +96,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %6 = atomicrmw xor i32* %xort, i32 4080 monotonic 
store i32 %6, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -98,6 +106,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %7 = atomicrmw min i32* %val2, i32 16 monotonic store i32 %7, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %neg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -108,6 +117,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %8 = atomicrmw min i32* %val2, i32 %neg monotonic store i32 %8, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -117,6 +127,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %9 = atomicrmw max i32* %val2, i32 1 monotonic store i32 %9, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -126,6 +137,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %10 = atomicrmw max i32* %val2, i32 0 monotonic store i32 %10, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -135,6 +147,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %11 = atomicrmw umin i32* %val2, i32 16 monotonic store i32 %11, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %uneg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -145,6 +158,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic store i32 %12, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -154,6 +168,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %13 = atomicrmw umax i32* %val2, i32 1 monotonic store i32 %13, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll index 
d6f07f653576..e415b059692e 100644 --- a/test/CodeGen/AVR/branch-relaxation.ll +++ b/test/CodeGen/AVR/branch-relaxation.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=avr | FileCheck %s -; CHECKC-LABEL: relax_breq +; CHECK-LABEL: relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: brne LBB0_1 ; CHECK: rjmp LBB0_2 @@ -66,7 +66,7 @@ finished: ret i8 3 } -; CHECKC-LABEL: no_relax_breq +; CHECK-LABEL: no_relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]] ; CHECK: nop diff --git a/test/CodeGen/BPF/select_ri.ll b/test/CodeGen/BPF/select_ri.ll new file mode 100644 index 000000000000..c4ac376502b8 --- /dev/null +++ b/test/CodeGen/BPF/select_ri.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s +; +; Source file: +; int b, c; +; int test() { +; int a = b; +; if (a) +; a = c; +; return a; +; } +@b = common local_unnamed_addr global i32 0, align 4 +@c = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readonly +define i32 @test() local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @b, align 4 + %tobool = icmp eq i32 %0, 0 + %1 = load i32, i32* @c, align 4 + %. = select i1 %tobool, i32 0, i32 %1 +; CHECK: r1 = ll +; CHECK: r1 = *(u32 *)(r1 + 0) +; CHECK: if r1 == 0 goto + ret i32 %. 
+} + +attributes #0 = { norecurse nounwind readonly } diff --git a/test/CodeGen/BPF/setcc.ll b/test/CodeGen/BPF/setcc.ll index 294c49365670..7e20814da807 100644 --- a/test/CodeGen/BPF/setcc.ll +++ b/test/CodeGen/BPF/setcc.ll @@ -7,7 +7,7 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccweqand: -; CHECK: if r1 == r2 +; CHECK: if r1 == 0 define i16 @sccwneand(i16 %a, i16 %b) nounwind { %t1 = and i16 %a, %b @@ -16,7 +16,7 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccwneand: -; CHECK: if r1 != r2 +; CHECK: if r1 != 0 define i16 @sccwne(i16 %a, i16 %b) nounwind { %t1 = icmp ne i16 %a, %b diff --git a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll index 9e4664ad69c9..48c5f8f4d247 100644 --- a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll +++ b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s ; Bug: PR31341 -; XFAIL: avr ;; Date: Jul 29, 2003. ;; From: test/Programs/MultiSource/Ptrdist-bc diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll index a9a33d72bca2..afa2e8a72ed1 100644 --- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll +++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - ; This caused ScheduleDAG to crash in EmitPhysRegCopy when searching ; the uses of a copy to a physical register without ignoring non-data ; dependence, PR10220. 
diff --git a/test/CodeGen/Generic/print-mul-exp.ll b/test/CodeGen/Generic/print-mul-exp.ll index 91c8147aaad9..1426fb59f669 100644 --- a/test/CodeGen/Generic/print-mul-exp.ll +++ b/test/CodeGen/Generic/print-mul-exp.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @a_mul_str = internal constant [13 x i8] c"a * %d = %d\0A\00" ; <[13 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-mul.ll b/test/CodeGen/Generic/print-mul.ll index 4b60d759278a..20fb1be6edef 100644 --- a/test/CodeGen/Generic/print-mul.ll +++ b/test/CodeGen/Generic/print-mul.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-shift.ll b/test/CodeGen/Generic/print-shift.ll index 56b3ec1df760..1fda55420b59 100644 --- a/test/CodeGen/Generic/print-shift.ll +++ b/test/CodeGen/Generic/print-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/v-split.ll b/test/CodeGen/Generic/v-split.ll index 91aece94fecd..f9a1cee440ca 100644 --- a/test/CodeGen/Generic/v-split.ll +++ b/test/CodeGen/Generic/v-split.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %f8 = type <8 x float> define void @test_f8(%f8 *%P, %f8* %Q, %f8 *%S) { diff --git a/test/CodeGen/Generic/vector-redux.ll b/test/CodeGen/Generic/vector-redux.ll index 64562d6d9490..8efdbf85b8c0 100644 --- a/test/CodeGen/Generic/vector-redux.ll +++ b/test/CodeGen/Generic/vector-redux.ll @@ -1,9 +1,6 @@ ; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; Bug: PR31898 -; XFAIL: avr - @a = global [1024 x i32] zeroinitializer, align 16 define i32 @reduce_add() { 
diff --git a/test/CodeGen/Generic/vector.ll b/test/CodeGen/Generic/vector.ll index 9c0cacdcd878..2d4dc501a53a 100644 --- a/test/CodeGen/Generic/vector.ll +++ b/test/CodeGen/Generic/vector.ll @@ -1,9 +1,6 @@ ; Test that vectors are scalarized/lowered correctly. ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %d8 = type <8 x double> %f1 = type <1 x float> %f2 = type <2 x float> diff --git a/test/CodeGen/Hexagon/intrinsics/system_user.ll b/test/CodeGen/Hexagon/intrinsics/system_user.ll index ac4c53e221d0..23473c92da91 100644 --- a/test/CodeGen/Hexagon/intrinsics/system_user.ll +++ b/test/CodeGen/Hexagon/intrinsics/system_user.ll @@ -1,13 +1,71 @@ -; RUN: llc -march=hexagon -O0 < %s | FileCheck %s -; RUN: llc -march=hexagon -O0 < %s | FileCheck -check-prefix=CHECK-CALL %s -; Hexagon Programmer's Reference Manual 11.9.1 SYSTEM/USER +; RUN: llc -march=hexagon < %s | FileCheck %s -; CHECK-CALL-NOT: call +target triple = "hexagon" -; Data cache prefetch -declare void @llvm.hexagon.prefetch(i8*) -define void @prefetch(i8* %a) { - call void @llvm.hexagon.prefetch(i8* %a) +; CHECK-LABEL: dc00: +; CHECK: dcfetch +define void @dc00(i8* nocapture readonly %p) local_unnamed_addr #0 { + tail call void @llvm.hexagon.prefetch(i8* %p) ret void } -; CHECK: dcfetch({{.*}}+#0) + +; CHECK-LABEL: dc01: +; CHECK: dccleana +define void @dc01(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleana(i8* %p) + ret void +} + +; CHECK-LABEL: dc02: +; CHECK: dccleaninva +define void @dc02(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleaninva(i8* %p) + ret void +} + +; CHECK-LABEL: dc03: +; CHECK: dcinva +define void @dc03(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dcinva(i8* %p) + ret void +} + +; CHECK-LABEL: dc04: +; CHECK: dczeroa +define void @dc04(i8* nocapture %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dczeroa(i8* %p) + 
ret void +} + +; CHECK-LABEL: dc05: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}) +define void @dc05(i8* nocapture readonly %p, i32 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y4.l2fetch(i8* %p, i32 %q) + ret void +} + +; CHECK-LABEL: dc06: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}:{{[0-9]+}}) +define void @dc06(i8* nocapture readonly %p, i64 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y5.l2fetch(i8* %p, i64 %q) + ret void +} + +declare void @llvm.hexagon.prefetch(i8* nocapture) #1 +declare void @llvm.hexagon.Y2.dccleana(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dccleaninva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dcinva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dczeroa(i8* nocapture) #3 +declare void @llvm.hexagon.Y4.l2fetch(i8* nocapture readonly, i32) #2 +declare void @llvm.hexagon.Y5.l2fetch(i8* nocapture readonly, i64) #2 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" } +attributes #1 = { inaccessiblemem_or_argmemonly nounwind } +attributes #2 = { nounwind } +attributes #3 = { argmemonly nounwind writeonly } diff --git a/test/CodeGen/Hexagon/switch-lut-explicit-section.ll b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll new file mode 100644 index 000000000000..6c67a0dab1a8 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll @@ -0,0 +1,32 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=FUNCTEXT %s +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of lookup table in explicit section from the attribute set. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}tcm.hexagon, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT-NOT: .text +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 section "tcm.hexagon" { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-function-section.ll b/test/CodeGen/Hexagon/switch-lut-function-section.ll new file mode 100644 index 000000000000..bb2b1e798c8a --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-function-section.ll @@ -0,0 +1,30 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of lookup table in function's text section. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}text.foo, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll new file mode 100644 index 000000000000..57fdfbf33abc --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll @@ -0,0 +1,42 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;If the look up table is used by more than one function, we should ignore the +;flag and place it the rodata. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +define i32 @goo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-text-section.ll b/test/CodeGen/Hexagon/switch-lut-text-section.ll new file mode 100644 index 000000000000..b4d3e898d103 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-text-section.ll @@ -0,0 +1,27 @@ 
+;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;This test checks the placement of lookup table in text section. +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT-NOT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/v6vec-vprint.ll b/test/CodeGen/Hexagon/v6vec-vprint.ll index 224547c24b75..24daeac3fb5d 100644 --- a/test/CodeGen/Hexagon/v6vec-vprint.ll +++ b/test/CodeGen/Hexagon/v6vec-vprint.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx 
-disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck %s ; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print -trace-hex-vector-stores-only < %s | FileCheck --check-prefix=VSTPRINT %s ; generate .long XXXX which is a vector debug print instruction. ; CHECK: .long 0x1dffe0 diff --git a/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll new file mode 100644 index 000000000000..32abb75f20f4 --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=hexagon -O0 < %s | FileCheck %s + +; CHECK-LABEL: danny: +; CHECK-DAG: [[T0:r[0-9]+]] = memuh(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memuh(r0+#2) +; CHECK: [[T0]] |= asl([[T1]],#16) +; CHECK-DAG: [[T2:r[0-9]+]] = memuh(r0+#4) +; CHECK-DAG: [[T3:r[0-9]+]] = memuh(r0+#6) +; CHECK: [[T2]] |= asl([[T3]],#16) +; CHECK: combine([[T2]],[[T0]]) +define <4 x i16> @danny(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 2 + ret <4 x i16> %t0 +} + +; CHECK-LABEL: sammy: +; CHECK-DAG: [[T0:r[0-9]+]] = memw(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memw(r0+#4) +; CHECK: combine([[T1]],[[T0]]) +define <4 x i16> @sammy(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 4 + ret <4 x i16> %t0 +} diff --git a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll b/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll deleted file mode 100644 index f49a1e24a1bb..000000000000 --- a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s - -; Check that store is post-incremented. 
-; CHECK: memuh(r{{[0-9]+}}+#6) -; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}}) -; CHECK: vaddh - -target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" -target triple = "hexagon" - -define void @matrix_add_const(i32 %N, i16* nocapture %A, i16 signext %val) #0 { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %polly.cond - -for.end.loopexit: ; preds = %polly.stmt.for.body29 - br label %for.end - -for.end: ; preds = %for.end.loopexit, %polly.loop_header24.preheader, %entry - ret void - -polly.cond: ; preds = %entry - %0 = icmp sgt i32 %N, 3 - br i1 %0, label %polly.then, label %polly.loop_header24.preheader - -polly.then: ; preds = %polly.cond - %1 = add i32 %N, -1 - %leftover_lb = and i32 %1, -4 - %2 = icmp sgt i32 %leftover_lb, 0 - br i1 %2, label %polly.loop_body.lr.ph, label %polly.loop_header24.preheader - -polly.loop_body.lr.ph: ; preds = %polly.then - %3 = insertelement <4 x i16> undef, i16 %val, i32 0 - %4 = insertelement <4 x i16> %3, i16 %val, i32 1 - %5 = insertelement <4 x i16> %4, i16 %val, i32 2 - %6 = insertelement <4 x i16> %5, i16 %val, i32 3 - br label %polly.loop_body - -polly.loop_header24.preheader.loopexit: ; preds = %polly.loop_body - br label %polly.loop_header24.preheader - -polly.loop_header24.preheader: ; preds = %polly.loop_header24.preheader.loopexit, %polly.then, %polly.cond - %polly.loopiv27.ph = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.then ], [ %leftover_lb, %polly.loop_header24.preheader.loopexit ] - %7 = icmp slt i32 %polly.loopiv27.ph, %N - br i1 %7, label %polly.stmt.for.body29.preheader, label %for.end - -polly.stmt.for.body29.preheader: ; preds = %polly.loop_header24.preheader - br label %polly.stmt.for.body29 - -polly.loop_body: ; preds = %polly.loop_body.lr.ph, %polly.loop_body - %p_arrayidx.phi = phi i16* [ %A, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ] - %polly.loopiv34 = phi i32 [ 0, 
%polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ] - %polly.next_loopiv = add nsw i32 %polly.loopiv34, 4 - %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>* - %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2 - %addp_vec = add <4 x i16> %_p_vec_full, %6 - store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr, align 2 - %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb - %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4 - br i1 %8, label %polly.loop_body, label %polly.loop_header24.preheader.loopexit - -polly.stmt.for.body29: ; preds = %polly.stmt.for.body29.preheader, %polly.stmt.for.body29 - %polly.loopiv2733 = phi i32 [ %polly.next_loopiv28, %polly.stmt.for.body29 ], [ %polly.loopiv27.ph, %polly.stmt.for.body29.preheader ] - %polly.next_loopiv28 = add nsw i32 %polly.loopiv2733, 1 - %p_arrayidx30 = getelementptr i16, i16* %A, i32 %polly.loopiv2733 - %_p_scalar_ = load i16, i16* %p_arrayidx30, align 2 - %p_add = add i16 %_p_scalar_, %val - store i16 %p_add, i16* %p_arrayidx30, align 2 - %exitcond = icmp eq i32 %polly.next_loopiv28, %N - br i1 %exitcond, label %for.end.loopexit, label %polly.stmt.for.body29 -} - -attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" } diff --git a/test/CodeGen/Hexagon/vect/vect-v4i16.ll b/test/CodeGen/Hexagon/vect/vect-v4i16.ll new file mode 100644 index 000000000000..f49a1e24a1bb --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-v4i16.ll @@ -0,0 +1,73 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s + +; Check that store is post-incremented. 
+; CHECK: memuh(r{{[0-9]+}}+#6) +; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK: vaddh + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" +target triple = "hexagon" + +define void @matrix_add_const(i32 %N, i16* nocapture %A, i16 signext %val) #0 { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %polly.cond + +for.end.loopexit: ; preds = %polly.stmt.for.body29 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %polly.loop_header24.preheader, %entry + ret void + +polly.cond: ; preds = %entry + %0 = icmp sgt i32 %N, 3 + br i1 %0, label %polly.then, label %polly.loop_header24.preheader + +polly.then: ; preds = %polly.cond + %1 = add i32 %N, -1 + %leftover_lb = and i32 %1, -4 + %2 = icmp sgt i32 %leftover_lb, 0 + br i1 %2, label %polly.loop_body.lr.ph, label %polly.loop_header24.preheader + +polly.loop_body.lr.ph: ; preds = %polly.then + %3 = insertelement <4 x i16> undef, i16 %val, i32 0 + %4 = insertelement <4 x i16> %3, i16 %val, i32 1 + %5 = insertelement <4 x i16> %4, i16 %val, i32 2 + %6 = insertelement <4 x i16> %5, i16 %val, i32 3 + br label %polly.loop_body + +polly.loop_header24.preheader.loopexit: ; preds = %polly.loop_body + br label %polly.loop_header24.preheader + +polly.loop_header24.preheader: ; preds = %polly.loop_header24.preheader.loopexit, %polly.then, %polly.cond + %polly.loopiv27.ph = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.then ], [ %leftover_lb, %polly.loop_header24.preheader.loopexit ] + %7 = icmp slt i32 %polly.loopiv27.ph, %N + br i1 %7, label %polly.stmt.for.body29.preheader, label %for.end + +polly.stmt.for.body29.preheader: ; preds = %polly.loop_header24.preheader + br label %polly.stmt.for.body29 + +polly.loop_body: ; preds = %polly.loop_body.lr.ph, %polly.loop_body + %p_arrayidx.phi = phi i16* [ %A, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ] + %polly.loopiv34 = phi i32 [ 0, 
%polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ] + %polly.next_loopiv = add nsw i32 %polly.loopiv34, 4 + %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>* + %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2 + %addp_vec = add <4 x i16> %_p_vec_full, %6 + store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr, align 2 + %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb + %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4 + br i1 %8, label %polly.loop_body, label %polly.loop_header24.preheader.loopexit + +polly.stmt.for.body29: ; preds = %polly.stmt.for.body29.preheader, %polly.stmt.for.body29 + %polly.loopiv2733 = phi i32 [ %polly.next_loopiv28, %polly.stmt.for.body29 ], [ %polly.loopiv27.ph, %polly.stmt.for.body29.preheader ] + %polly.next_loopiv28 = add nsw i32 %polly.loopiv2733, 1 + %p_arrayidx30 = getelementptr i16, i16* %A, i32 %polly.loopiv2733 + %_p_scalar_ = load i16, i16* %p_arrayidx30, align 2 + %p_add = add i16 %_p_scalar_, %val + store i16 %p_add, i16* %p_arrayidx30, align 2 + %exitcond = icmp eq i32 %polly.next_loopiv28, %N + br i1 %exitcond, label %for.end.loopexit, label %polly.stmt.for.body29 +} + +attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" } diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir index f853b551e098..c71302d97e2e 100644 --- a/test/CodeGen/MIR/AArch64/target-memoperands.mir +++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir @@ -10,13 +10,17 @@ --- # CHECK-LABEL: name: target_memoperands # CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) +# CHECK: %2(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) # CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) +# CHECK: G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) name: target_memoperands body: | bb.0: %0:_(p0) = COPY 
%x0 %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) + %2:_(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) + G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) RET_ReallyLR ... diff --git a/test/CodeGen/MIR/AMDGPU/fold-multiple.mir b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir new file mode 100644 index 000000000000..a5da33a997d3 --- /dev/null +++ b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir @@ -0,0 +1,40 @@ +# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s +--- | + define amdgpu_kernel void @test() #0 { + ret void + } + + attributes #0 = { nounwind } + +... +--- + +# This used to crash / trigger an assertion, because re-scanning the use list +# after constant-folding the definition of %3 lead to the definition of %2 +# being processed twice. + +# CHECK-LABEL: name: test +# CHECK: %2 = V_LSHLREV_B32_e32 2, killed %0, implicit %exec +# CHECK: %4 = V_AND_B32_e32 8, killed %2, implicit %exec + +name: test +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: sreg_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_128 } +body: | + bb.0 (%ir-block.0): + %0 = IMPLICIT_DEF + %1 = S_MOV_B32 2 + %2 = V_LSHLREV_B32_e64 %1, killed %0, implicit %exec + %3 = S_LSHL_B32 %1, killed %1, implicit-def dead %scc + %4 = V_AND_B32_e64 killed %2, killed %3, implicit %exec + %5 = IMPLICIT_DEF + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll index 4baf499848fd..3501861f5757 100644 --- a/test/CodeGen/MSP430/vararg.ll +++ b/test/CodeGen/MSP430/vararg.ll @@ -39,11 +39,11 @@ entry: ; CHECK-LABEL: va_copy: %vl.addr = alloca i8*, align 2 %vl2 = alloca i8*, align 2 -; CHECK: mov.w r12, 2(r1) +; CHECK-DAG: mov.w r12, 2(r1) store i8* %vl, i8** %vl.addr, align 2 %0 = bitcast i8** %vl2 to i8* %1 = bitcast i8** %vl.addr to i8* -; CHECK-NEXT: mov.w r12, 0(r1) +; CHECK-DAG: mov.w r12, 0(r1) call void @llvm.va_copy(i8* %0, i8* %1) ret void } diff --git a/test/CodeGen/Mips/2008-06-05-Carry.ll b/test/CodeGen/Mips/2008-06-05-Carry.ll index c61e1cdedea7..5e6092fc7848 100644 --- a/test/CodeGen/Mips/2008-06-05-Carry.ll +++ b/test/CodeGen/Mips/2008-06-05-Carry.ll @@ -2,20 +2,21 @@ define i64 @add64(i64 %u, i64 %v) nounwind { entry: +; CHECK-LABEL: add64: ; CHECK: addu -; CHECK: sltu +; CHECK-DAG: sltu +; CHECK-DAG: addu ; CHECK: addu -; CHECK: addu - %tmp2 = add i64 %u, %v + %tmp2 = add i64 %u, %v ret i64 %tmp2 } define i64 @sub64(i64 %u, i64 %v) nounwind { entry: -; CHECK: sub64 +; CHECK-LABEL: sub64 +; CHECK-DAG: sltu +; CHECK-DAG: subu ; CHECK: subu -; CHECK: sltu -; CHECK: addu ; CHECK: subu %tmp2 = sub i64 %u, %v ret i64 %tmp2 diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 5c0415759266..2aa824250d3b 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -59,9 +59,9 @@ entry: ; CHECK-LABEL: f123: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 -; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 34, 16 @@ -94,4 +94,4 @@ entry: ; MIPS32R2: ori $[[R0:[0-9]+]], 
$[[R0:[0-9]+]], 8 ; MIPS32R2-NOT: ins {{[[:space:]].*}} ; MIPS64R2N32: ori $[[R0:[0-9]+]], $[[R0:[0-9]+]], 8 -; MIPS64R2N32-NOT: ins {{[[:space:]].*}} \ No newline at end of file +; MIPS64R2N32-NOT: ins {{[[:space:]].*}} diff --git a/test/CodeGen/Mips/dsp-patterns.ll b/test/CodeGen/Mips/dsp-patterns.ll index 837c0d8bfc52..250d3eff37dc 100644 --- a/test/CodeGen/Mips/dsp-patterns.ll +++ b/test/CodeGen/Mips/dsp-patterns.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=R1 -; RUN: llc -march=mips -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dsp < %s | FileCheck %s -check-prefix=R1 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 ; R1-LABEL: test_lbux: ; R1: lbux ${{[0-9]+}} diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll index fcf129420234..b7cc6fc8ea75 100644 --- a/test/CodeGen/Mips/llcarry.ll +++ b/test/CodeGen/Mips/llcarry.ll @@ -14,9 +14,9 @@ entry: %add = add nsw i64 %1, %0 store i64 %add, i64* @k, align 8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} -; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ret void } @@ -28,8 +28,8 @@ entry: %sub = sub nsw i64 %0, %1 ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 +; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} store i64 %sub, i64* @l, align 8 ret void @@ -41,8 +41,7 @@ entry: %add = add nsw i64 %0, 15 ; 16: addiu ${{[0-9]+}}, 15 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, 
${{[0-9]+}}, ${{[0-9]+}} store i64 %add, i64* @m, align 8 ret void diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll index a5ecdda94ce2..63884eb03b8c 100644 --- a/test/CodeGen/Mips/llvm-ir/add.ll +++ b/test/CodeGen/Mips/llvm-ir/add.ll @@ -1,35 +1,35 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=ALL,R2-R6,GP32 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: 
-check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -O2 -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM32 +; RUN: -check-prefixes=ALL,MMR3,MM32 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -O2 | FileCheck %s \ ; RUN: -check-prefixes=ALL,MMR6,MM32 ; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -O2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM64 +; RUN: -check-prefixes=ALL,MM64 ; FIXME: This code sequence is inefficient as it should be 'subu $[[T0]], $zero, $[[T0]'. @@ -110,17 +110,17 @@ define signext i64 @add_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: add_i64: - ; GP32: addu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $3, $7 - ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP32: addu $2, $4, $[[T1]] + ; GP32-DAG: addu $[[T0:[0-9]+]], $4, $6 + ; GP32-DAG: addu $3, $5, $7 + ; GP32: sltu $[[T1:[0-9]+]], $3, $5 + ; GP32: addu $2, $[[T0]], $[[T1]] ; GP64: daddu $2, $4, $5 - ; MM32: addu16 $3, $5, $7 - ; MM32: sltu $[[T0:[0-9]+]], $3, $7 - ; MM32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM32: addu $2, $4, $[[T1]] + ; MM32-DAG: addu16 $3, $5, $7 + ; MM32-DAG: addu16 $[[T0:[0-9]+]], $4, $6 + ; MM32: sltu $[[T1:[0-9]+]], $3, $5 + ; MM32: addu16 $2, $[[T0]], $[[T1]] ; MM64: daddu $2, $4, $5 @@ -132,49 +132,108 @@ define signext i128 @add_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: add_i128: - ; GP32: lw $[[T0:[0-9]+]], 28($sp) - ; GP32: addu $[[T1:[0-9]+]], $7, $[[T0]] - ; GP32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; GP32: lw $[[T3:[0-9]+]], 24($sp) - ; GP32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; GP32: addu $[[T5:[0-9]+]], $6, $[[T4]] - ; GP32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; GP32: lw $[[T7:[0-9]+]], 20($sp) - ; GP32: 
addu $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; GP32: lw $[[T9:[0-9]+]], 16($sp) - ; GP32: addu $3, $5, $[[T8]] - ; GP32: sltu $[[T10:[0-9]+]], $3, $[[T7]] - ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T9]] - ; GP32: addu $2, $4, $[[T11]] - ; GP32: move $4, $[[T5]] - ; GP32: move $5, $[[T1]] - - ; GP64: daddu $3, $5, $7 - ; GP64: sltu $[[T0:[0-9]+]], $3, $7 - ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: lw $[[T0:[0-9]+]], 28($sp) - ; MM32: addu $[[T1:[0-9]+]], $7, $[[T0]] - ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; MM32: lw $[[T3:[0-9]+]], 24($sp) - ; MM32: addu16 $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T5:[0-9]+]], $6, $[[T4]] - ; MM32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; MM32: lw $[[T7:[0-9]+]], 20($sp) - ; MM32: addu16 $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; MM32: lw $[[T9:[0-9]+]], 16($sp) - ; MM32: addu16 $[[T10:[0-9]+]], $5, $[[T8]] - ; MM32: sltu $[[T11:[0-9]+]], $[[T10]], $[[T7]] - ; MM32: addu $[[T12:[0-9]+]], $[[T11]], $[[T9]] - ; MM32: addu16 $[[T13:[0-9]+]], $4, $[[T12]] - ; MM32: move $4, $[[T5]] - ; MM32: move $5, $[[T1]] - + ; PRE4: move $[[R1:[0-9]+]], $5 + ; PRE4: move $[[R2:[0-9]+]], $4 + ; PRE4: lw $[[R3:[0-9]+]], 24($sp) + ; PRE4: addu $[[R4:[0-9]+]], $6, $[[R3]] + ; PRE4: lw $[[R5:[0-9]+]], 28($sp) + ; PRE4: addu $[[R6:[0-9]+]], $7, $[[R5]] + ; PRE4: sltu $[[R7:[0-9]+]], $[[R6]], $7 + ; PRE4: addu $[[R8:[0-9]+]], $[[R4]], $[[R7]] + ; PRE4: xor $[[R9:[0-9]+]], $[[R8]], $6 + ; PRE4: sltiu $[[R10:[0-9]+]], $[[R9]], 1 + ; PRE4: bnez $[[R10]], $BB5_2 + ; PRE4: sltu $[[R7]], $[[R8]], $6 + ; PRE4: lw $[[R12:[0-9]+]], 20($sp) + ; PRE4: addu $[[R13:[0-9]+]], $[[R1]], $[[R12]] + ; PRE4: lw $[[R14:[0-9]+]], 16($sp) + ; PRE4: addu $[[R15:[0-9]+]], $[[R13]], $[[R7]] + ; PRE4: addu $[[R16:[0-9]+]], $[[R2]], $[[R14]] + ; PRE4: sltu $[[R17:[0-9]+]], $[[R15]], $[[R13]] + ; PRE4: sltu $[[R18:[0-9]+]], $[[R13]], $[[R1]] + ; PRE4: addu $[[R19:[0-9]+]], $[[R16]], $[[R18]] + ; PRE4: addu $2, $[[R19]], 
$[[R17]] + + ; GP32-CMOV: lw $[[T0:[0-9]+]], 24($sp) + ; GP32-CMOV: addu $[[T1:[0-9]+]], $6, $[[T0]] + ; GP32-CMOV: lw $[[T2:[0-9]+]], 28($sp) + ; GP32-CMOV: addu $[[T3:[0-9]+]], $7, $[[T2]] + ; GP32-CMOV: sltu $[[T4:[0-9]+]], $[[T3]], $7 + ; GP32-CMOV: addu $[[T5:[0-9]+]], $[[T1]], $[[T4]] + ; GP32-CMOV: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: xor $[[T7:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: movz $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; GP32-CMOV: lw $[[T9:[0-9]+]], 20($sp) + ; GP32-CMOV: addu $[[T10:[0-9]+]], $5, $[[T4]] + ; GP32-CMOV: addu $[[T11:[0-9]+]], $[[T10]], $[[T8]] + ; GP32-CMOV: lw $[[T12:[0-9]+]], 16($sp) + ; GP32-CMOV: sltu $[[T13:[0-9]+]], $[[T11]], $[[T10]] + ; GP32-CMOV: addu $[[T14:[0-9]+]], $4, $[[T12]] + ; GP32-CMOV: sltu $[[T15:[0-9]+]], $[[T10]], $5 + ; GP32-CMOV: addu $[[T16:[0-9]+]], $[[T14]], $[[T15]] + ; GP32-CMOV: addu $[[T17:[0-9]+]], $[[T16]], $[[T13]] + ; GP32-CMOV: move $4, $[[T5]] + ; GP32-CMOV: move $5, $[[T3]] + + ; GP64: daddu $[[T0:[0-9]+]], $4, $6 + ; GP64: daddu $[[T1:[0-9]+]], $5, $7 + ; GP64: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; GP64-NOT-R2-R6: dsll $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T4:[0-9]+]], $[[T3]], 32 + ; GP64-R2-R6: dext $[[T4:[0-9]+]], $[[T2]], 0, 32 + + ; GP64: daddu $2, $[[T0]], $[[T4]] + + ; MMR3: move $[[T1:[0-9]+]], $5 + ; MMR3-DAG: lw $[[T2:[0-9]+]], 32($sp) + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3-DAG: lw $[[T4:[0-9]+]], 36($sp) + ; MMR3: addu16 $[[T5:[0-9]+]], $7, $[[T4]] + ; MMR3: sltu $[[T6:[0-9]+]], $[[T5]], $7 + ; MMR3: addu16 $[[T7:[0-9]+]], $[[T3]], $[[T6]] + ; MMR3: sltu $[[T8:[0-9]+]], $[[T7]], $6 + ; MMR3: xor $[[T9:[0-9]+]], $[[T7]], $6 + ; MMR3: movz $[[T8]], $[[T6]], $[[T9]] + ; MMR3: lw $[[T10:[0-9]+]], 28($sp) + ; MMR3: addu16 $[[T11:[0-9]+]], $[[T1]], $[[T10]] + ; MMR3: addu16 $[[T12:[0-9]+]], $[[T11]], $[[T8]] + ; MMR3: lw $[[T13:[0-9]+]], 24($sp) + ; MMR3: sltu $[[T14:[0-9]+]], $[[T12]], $[[T11]] + ; MMR3: addu16 $[[T15:[0-9]+]], $4, $[[T13]] + ; MMR3: 
sltu $[[T16:[0-9]+]], $[[T11]], $[[T1]] + ; MMR3: addu16 $[[T17:[0-9]+]], $[[T15]], $[[T16]] + ; MMR3: addu16 $2, $2, $[[T14]] + + ; MMR6: move $[[T1:[0-9]+]], $5 + ; MMR6: move $[[T2:[0-9]+]], $4 + ; MMR6: lw $[[T3:[0-9]+]], 32($sp) + ; MMR6: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR6: lw $[[T5:[0-9]+]], 36($sp) + ; MMR6: addu16 $[[T6:[0-9]+]], $7, $[[T5]] + ; MMR6: sltu $[[T7:[0-9]+]], $[[T6]], $7 + ; MMR6: addu16 $[[T8:[0-9]+]], $[[T4]], $7 + ; MMR6: sltu $[[T9:[0-9]+]], $[[T8]], $6 + ; MMR6: xor $[[T10:[0-9]+]], $[[T4]], $6 + ; MMR6: sltiu $[[T11:[0-9]+]], $[[T10]], 1 + ; MMR6: seleqz $[[T12:[0-9]+]], $[[T9]], $[[T11]] + ; MMR6: selnez $[[T13:[0-9]+]], $[[T7]], $[[T11]] + ; MMR6: lw $[[T14:[0-9]+]], 24($sp) + ; MMR6: or $[[T15:[0-9]+]], $[[T13]], $[[T12]] + ; MMR6: addu16 $[[T16:[0-9]+]], $[[T2]], $[[T14]] + ; MMR6: lw $[[T17:[0-9]+]], 28($sp) + ; MMR6: addu16 $[[T18:[0-9]+]], $[[T1]], $[[T17]] + ; MMR6: addu16 $[[T19:[0-9]+]], $[[T18]], $[[T15]] + ; MMR6: sltu $[[T20:[0-9]+]], $[[T18]], $[[T1]] + ; MMR6: sltu $[[T21:[0-9]+]], $[[T17]], $[[T18]] + ; MMR6: addu16 $2, $[[T16]], $[[T20]] + ; MMR6: addu16 $2, $[[T20]], $[[T21]] + + ; MM64: daddu $[[T0:[0-9]+]], $4, $6 ; MM64: daddu $3, $5, $7 - ; MM64: sltu $[[T0:[0-9]+]], $3, $7 - ; MM64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $3, $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $[[T0]], $[[T3]] %r = add i128 %a, %b ret i128 %r @@ -249,17 +308,16 @@ define signext i32 @add_i32_4(i32 signext %a) { define signext i64 @add_i64_4(i64 signext %a) { ; ALL-LABEL: add_i64_4: - ; GP32: addiu $[[T0:[0-9]+]], $5, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $2, $4, $[[T1]] + ; GP32: addiu $3, $5, 4 + ; GP32: sltu $[[T0:[0-9]+]], $3, $5 + ; GP32: addu $2, $4, $[[T0]] + + ; MM32: addiur2 $[[T1:[0-9]+]], $5, 4 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; 
MM32: addu16 $2, $4, $[[T2]] ; GP64: daddiu $2, $4, 4 - ; MM32: addiu $[[T0:[0-9]+]], $5, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 4 @@ -270,38 +328,67 @@ define signext i64 @add_i64_4(i64 signext %a) { define signext i128 @add_i128_4(i128 signext %a) { ; ALL-LABEL: add_i128_4: - ; GP32: addiu $[[T0:[0-9]+]], $7, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T1]], $[[T2]], $zero - ; GP32: addu $[[T3:[0-9]+]], $5, $[[T1]] - ; GP32: sltu $[[T1]], $[[T3]], $zero - ; GP32: addu $[[T1]], $4, $[[T1]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] - - ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 4 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: addiu $[[T0:[0-9]+]], $7, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T1]], 0 - ; MM32: sltu $[[T3:[0-9]+]], $[[T2]], $[[T1]] - ; MM32: addu16 $[[T3]], $5, $[[T3]] - ; MM32: sltu $[[T1]], $[[T3]], $[[T1]] - ; MM32: addu16 $[[T1]], $4, $[[T1]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 4 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] + + ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 4 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz 
$[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] + + ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: addiur2 $[[T0:[0-9]+]], $7, 4 + ; MMR3: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T0]], $7 + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3: sltu $[[T4:[0-9]+]], $[[T3]], $6 + ; MMR3: movz $[[T4]], $[[T2]], $[[T1]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T4]] + ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: addiur2 $[[T1:[0-9]+]], $7, 4 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $7 + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 4 - ; MM64: daddiu $[[T1:[0-9]+]], $zero, 4 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, $[[T3]] %r = add i128 4, %a ret i128 %r @@ -380,16 +467,15 @@ define signext i64 @add_i64_3(i64 signext %a) { ; ALL-LABEL: add_i64_3: ; GP32: addiu $[[T0:[0-9]+]], $5, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] + ; 
GP32: sltu $[[T1:[0-9]+]], $[[T0]], $5 ; GP32: addu $2, $4, $[[T1]] ; GP64: daddiu $2, $4, 3 - ; MM32: addiu $[[T0:[0-9]+]], $5, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] + ; MM32: move $[[T1:[0-9]+]], $5 + ; MM32: addius5 $[[T1]], 3 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; MM32: addu16 $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 3 @@ -400,38 +486,70 @@ define signext i64 @add_i64_3(i64 signext %a) { define signext i128 @add_i128_3(i128 signext %a) { ; ALL-LABEL: add_i128_3: - ; GP32: addiu $[[T0:[0-9]+]], $7, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T3:[0-9]+]], $[[T2]], $zero - ; GP32: addu $[[T4:[0-9]+]], $5, $[[T3]] - ; GP32: sltu $[[T5:[0-9]+]], $[[T4]], $zero - ; GP32: addu $[[T5]], $4, $[[T5]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] - - ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: addiu $[[T0:[0-9]+]], $7, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T3:[0-9]+]], 0 - ; MM32: sltu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T4]], $5, $[[T4]] - ; MM32: sltu $[[T5:[0-9]+]], $[[T4]], $[[T3]] - ; MM32: addu16 $[[T5]], $4, $[[T5]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 3 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] + + 
; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 3 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] + + ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: move $[[T1:[0-9]+]], $7 + ; MMR3: addius5 $[[T1]], 3 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR3: sltu $[[T3:[0-9]+]], $[[T1]], $7 + ; MMR3: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR3: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; MMR3: movz $[[T5]], $[[T3]], $[[T2]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T5]] + ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: move $[[T1:[0-9]+]], $7 + ; MMR6: addius5 $[[T1]], 3 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $[[T5]] + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 3 - ; MM64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, 
$[[T3]] %r = add i128 3, %a ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll index a730063c552f..655addb10a64 100644 --- a/test/CodeGen/Mips/llvm-ir/sub.ll +++ b/test/CodeGen/Mips/llvm-ir/sub.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ ; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ @@ -11,25 +11,25 @@ ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR3 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR6 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; 
RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP64,MM +; RUN: -check-prefixes=GP64,MM64 define signext i1 @sub_i1(i1 signext %a, i1 signext %b) { entry: @@ -100,10 +100,15 @@ define signext i64 @sub_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: sub_i64: - ; GP32-NOT-MM subu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $5, $7 - ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP32: subu $2, $4, $[[T1]] + ; GP32-NOT-MM: sltu $[[T0:[0-9]+]], $5, $7 + ; GP32-NOT-MM: subu $2, $4, $6 + ; GP32-NOT-MM: subu $2, $2, $[[T0]] + ; GP32-NOT-MM: subu $3, $5, $7 + + ; MM32: sltu $[[T0:[0-9]+]], $5, $7 + ; MM32: subu16 $3, $4, $6 + ; MM32: subu16 $2, $3, $[[T0]] + ; MM32: subu16 $3, $5, $7 ; GP64: dsubu $2, $4, $5 @@ -115,42 +120,109 @@ define signext i128 @sub_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: sub_i128: - ; GP32-NOT-MM: lw $[[T0:[0-9]+]], 20($sp) - ; GP32-NOT-MM: sltu $[[T1:[0-9]+]], $5, $[[T0]] - ; GP32-NOT-MM: lw $[[T2:[0-9]+]], 16($sp) - ; GP32-NOT-MM: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]] - ; GP32-NOT-MM: lw $[[T4:[0-9]+]], 24($sp) - ; GP32-NOT-MM: lw $[[T5:[0-9]+]], 28($sp) - ; GP32-NOT-MM: subu $[[T6:[0-9]+]], $7, $[[T5]] - ; GP32-NOT-MM: subu $2, $4, $[[T3]] - ; GP32-NOT-MM: sltu $[[T8:[0-9]+]], $6, $[[T4]] - ; GP32-NOT-MM: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]] - ; GP32-NOT-MM: subu $3, $5, $[[T9]] - ; GP32-NOT-MM: sltu $[[T10:[0-9]+]], $7, $[[T5]] - ; GP32-NOT-MM: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]] - ; GP32-NOT-MM: subu $4, $6, $[[T11]] - ; GP32-NOT-MM: move $5, $[[T6]] - - ; GP32-MM: lw $[[T0:[0-9]+]], 20($sp) - ; GP32-MM: sltu $[[T1:[0-9]+]], $[[T2:[0-9]+]], $[[T0]] - ; GP32-MM: lw $[[T3:[0-9]+]], 16($sp) - ; GP32-MM: addu $[[T3]], $[[T1]], $[[T3]] - ; GP32-MM: lw $[[T4:[0-9]+]], 
24($sp) - ; GP32-MM: lw $[[T5:[0-9]+]], 28($sp) - ; GP32-MM: subu $[[T1]], $7, $[[T5]] - ; GP32-MM: subu16 $[[T3]], $[[T6:[0-9]+]], $[[T3]] - ; GP32-MM: sltu $[[T6]], $6, $[[T4]] - ; GP32-MM: addu16 $[[T0]], $[[T6]], $[[T0]] - ; GP32-MM: subu16 $[[T0]], $5, $[[T0]] - ; GP32-MM: sltu $[[T6]], $7, $[[T5]] - ; GP32-MM: addu $[[T6]], $[[T6]], $[[T4]] - ; GP32-MM: subu16 $[[T6]], $6, $[[T6]] - ; GP32-MM: move $[[T2]], $[[T1]] - - ; GP64: dsubu $3, $5, $7 - ; GP64: sltu $[[T0:[0-9]+]], $5, $7 - ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP64: dsubu $2, $4, $[[T1]] +; PRE4: lw $[[T0:[0-9]+]], 24($sp) +; PRE4: lw $[[T1:[0-9]+]], 28($sp) +; PRE4: sltu $[[T2:[0-9]+]], $7, $[[T1]] +; PRE4: xor $[[T3:[0-9]+]], $6, $[[T0]] +; PRE4: sltiu $[[T4:[0-9]+]], $[[T3]], 1 +; PRE4: bnez $[[T4]] +; PRE4: move $[[T5:[0-9]+]], $[[T2]] +; PRE4: sltu $[[T5]], $6, $[[T0]] + +; PRE4: lw $[[T6:[0-9]+]], 20($sp) +; PRE4: subu $[[T7:[0-9]+]], $5, $[[T6]] +; PRE4: subu $[[T8:[0-9]+]], $[[T7]], $[[T5]] +; PRE4: sltu $[[T9:[0-9]+]], $[[T7]], $[[T5]] +; PRE4: sltu $[[T10:[0-9]+]], $5, $[[T6]] +; PRE4: lw $[[T11:[0-9]+]], 16($sp) +; PRE4: subu $[[T12:[0-9]+]], $4, $[[T11]] +; PRE4: subu $[[T13:[0-9]+]], $[[T12]], $[[T10]] +; PRE4: subu $[[T14:[0-9]+]], $[[T13]], $[[T9]] +; PRE4: subu $[[T15:[0-9]+]], $6, $[[T0]] +; PRE4: subu $[[T16:[0-9]+]], $[[T15]], $[[T2]] +; PRE4: subu $5, $7, $[[T1]] + +; MMR3: lw $[[T1:[0-9]+]], 48($sp) +; MMR3: sltu $[[T2:[0-9]+]], $6, $[[T1]] +; MMR3: xor $[[T3:[0-9]+]], $6, $[[T1]] +; MMR3: lw $[[T4:[0-9]+]], 52($sp) +; MMR3: sltu $[[T5:[0-9]+]], $7, $[[T4]] +; MMR3: movz $[[T6:[0-9]+]], $[[T5]], $[[T3]] +; MMR3: lw $[[T7:[0-8]+]], 44($sp) +; MMR3: subu16 $[[T8:[0-9]+]], $5, $[[T7]] +; MMR3: subu16 $[[T9:[0-9]+]], $[[T8]], $[[T6]] +; MMR3: sltu $[[T10:[0-9]+]], $[[T8]], $[[T2]] +; MMR3: sltu $[[T11:[0-9]+]], $5, $[[T7]] +; MMR3: lw $[[T12:[0-9]+]], 40($sp) +; MMR3: lw $[[T13:[0-9]+]], 12($sp) +; MMR3: subu16 $[[T14:[0-9]+]], $[[T13]], $[[T12]] +; MMR3: subu16 
$[[T15:[0-9]+]], $[[T14]], $[[T11]] +; MMR3: subu16 $[[T16:[0-9]+]], $[[T15]], $[[T10]] +; MMR3: subu16 $[[T17:[0-9]+]], $6, $[[T1]] +; MMR3: subu16 $[[T18:[0-9]+]], $[[T17]], $7 +; MMR3: lw $[[T19:[0-9]+]], 8($sp) +; MMR3: lw $[[T20:[0-9]+]], 0($sp) +; MMR3: subu16 $5, $[[T19]], $[[T20]] + +; MMR6: move $[[T0:[0-9]+]], $7 +; MMR6: sw $[[T0]], 8($sp) +; MMR6: move $[[T1:[0-9]+]], $5 +; MMR6: sw $4, 12($sp) +; MMR6: lw $[[T2:[0-9]+]], 48($sp) +; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]] +; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]] +; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1 +; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]] +; MMR6: lw $[[T7:[0-9]+]], 52($sp) +; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]] +; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]] +; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]] +; MMR6: lw $[[T11:[0-9]+]], 44($sp) +; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: lw $[[T18:[0-9]+]], 40($sp) +; MMR6: lw $[[T19:[0-9]+]], 12($sp) +; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]] +; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]] +; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]] +; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]] +; MMR6: subu16 $4, $[[T23]], $5 +; MMR6: lw $[[T24:[0-9]+]], 8($sp) +; MMR6: lw $[[T25:[0-9]+]], 0($sp) +; MMR6: subu16 $5, $[[T24]], $[[T25]] +; MMR6: lw $3, 4($sp) + +; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero +; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1. +; These should be combined away. + +; GP64-NOT-R2: dsubu $1, $4, $6 +; GP64-NOT-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-NOT-R2: dsll $[[T1:[0-9]+]], $[[T0]], 32 +; GP64-NOT-R2: dsrl $[[T2:[0-9]+]], $[[T1]], 32 +; GP64-NOT-R2: dsubu $2, $1, $[[T2]] +; GP64-NOT-R2: dsubu $3, $5, $7 + +; FIXME: Likewise for the sltu, dext here. 
+ +; GP64-R2: dsubu $1, $4, $6 +; GP64-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-R2: dext $[[T1:[0-9]+]], $[[T0]], 0, 32 +; GP64-R2: dsubu $2, $1, $[[T1]] +; GP64-R2: dsubu $3, $5, $7 + +; FIXME: Again, redundant sign extension. Also, microMIPSR6 has the +; dext instruction which should be used here. + +; MM64: dsubu $[[T0:[0-9]+]], $4, $6 +; MM64: sltu $[[T1:[0-9]+]], $5, $7 +; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 +; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; MM64: dsubu $2, $[[T0]], $[[T3]] +; MM64: dsubu $3, $5, $7 +; MM64: jr $ra %r = sub i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/long-calls.ll b/test/CodeGen/Mips/long-calls.ll new file mode 100644 index 000000000000..8a95e9b9307d --- /dev/null +++ b/test/CodeGen/Mips/long-calls.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=mips -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips -mattr=+long-calls,-noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s + +; RUN: llc -march=mips64 -target-abi n32 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n32 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips64 -target-abi n64 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n64 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON64 %s + +declare void @callee() +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) + +@val = internal unnamed_addr global [20 x i32] zeroinitializer, align 4 + +define void @caller() { + +; Use `jal` instruction with R_MIPS_26 relocation. +; OFF: jal callee +; OFF: jal memset + +; Save the `callee` and `memset` addresses in $25 register +; and use `jalr` for the jumps. 
+; ON32: lui $1, %hi(callee) +; ON32: addiu $25, $1, %lo(callee) +; ON32: jalr $25 + +; ON32: addiu $1, $zero, %lo(memset) +; ON32: lui $2, %hi(memset) +; ON32: addu $25, $2, $1 +; ON32: jalr $25 + +; ON64: lui $1, %highest(callee) +; ON64: daddiu $1, $1, %higher(callee) +; ON64: daddiu $1, $1, %hi(callee) +; ON64: daddiu $25, $1, %lo(callee) +; ON64: jalr $25 + +; ON64: daddiu $1, $zero, %higher(memset) +; ON64: lui $2, %highest(memset) +; ON64: lui $2, %hi(memset) +; ON64: daddiu $2, $zero, %lo(memset) +; ON64: daddu $25, $1, $2 +; ON64: jalr $25 + + call void @callee() + call void @llvm.memset.p0i8.i32(i8* bitcast ([20 x i32]* @val to i8*), i8 0, i32 80, i32 4, i1 false) + ret void +} diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll index 7baba005a072..3e1a2e8b9708 100644 --- a/test/CodeGen/Mips/madd-msub.ll +++ b/test/CodeGen/Mips/madd-msub.ll @@ -25,11 +25,11 @@ ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 -; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T4:[0-9]+]], $6, 31 +; 32R6-DAG: addu $[[T5:[0-9]+]], $[[T3]], $[[T4]] +; 32R6-DAG: addu $2, $[[T5]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -71,7 +71,7 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] ; FIXME: There's a redundant move here. 
We should remove it ; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $2, $[[T3]], $[[T2]] @@ -109,10 +109,10 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6 -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $1 +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $6 +; 32R6-DAG: addu $2, $[[T4]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -134,6 +134,17 @@ entry: ret i64 %add } +; ALL-LABEL: madd4 +; ALL-NOT: madd ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @madd4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %add = add nsw i32 %c, %mul + + ret i32 %add +} + ; ALL-LABEL: msub1: ; 32-DAG: sra $[[T0:[0-9]+]], $6, 31 @@ -148,13 +159,13 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]] -; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31 -; 32R6-DAG: subu $2, $[[T5]], $[[T4]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 +; 32R6-DAG: subu $[[T4:[0-9]+]], $[[T3]], $[[T2]] +; 32R6-DAG: subu $2, $[[T4]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -194,13 +205,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} - -; 32R6-DAG: sltu 
$[[T2:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: negu $2, $[[T3]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muhu $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: negu $[[T3:[0-9]+]], $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: d[[m:m]]ult $5, $4 ; 64-DAG: [[m]]flo $[[T0:[0-9]+]] @@ -234,12 +244,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: subu $2, $6, $[[T3]] -; 32R6-DAG: subu $3, $7, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $7, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: subu $[[T3:[0-9]+]], $6, $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $7, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -260,3 +270,14 @@ entry: %sub = sub nsw i64 %c, %mul ret i64 %sub } + +; ALL-LABEL: msub4 +; ALL-NOT: msub ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @msub4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %sub = sub nsw i32 %c, %mul + + ret i32 %sub +} diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll index ac69dc913c18..b3ed8bdd3b9a 100644 --- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll +++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll @@ -1,21 +1,21 @@ ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r5 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR5,MIPS32-O32,MIPS32R5-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; 
RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N32,MIPS64R5-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N64,MIPS64R5-N64 ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r6 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR6,MIPSR6-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N32,MIPSR6-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N64,MIPSR6-N64 diff --git a/test/CodeGen/PowerPC/PR33671.ll b/test/CodeGen/PowerPC/PR33671.ll new file mode 100644 index 000000000000..0edd2e8daff4 --- /dev/null +++ b/test/CodeGen/PowerPC/PR33671.ll @@ -0,0 +1,32 @@ +; Function Attrs: norecurse nounwind +; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 < %s | FileCheck %s +define void @test1(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 4 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 4 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + 
ret void +; CHECK-LABEL: test1 +; CHECK: lxv [[LD:[0-9]+]], 16(3) +; CHECK: stxv [[LD]], 16(4) +} + +; Function Attrs: norecurse nounwind +define void @test2(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 1 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + ret void +; CHECK-LABEL: test2 +; CHECK: addi 3, 3, 8 +; CHECK: lxvx [[LD:[0-9]+]], 0, 3 +; CHECK: addi 3, 4, 4 +; CHECK: stxvx [[LD]], 0, 3 +} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index 60bec4d18f12..3ad432872c0e 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1018,13 +1018,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}} -; P9BE-DAG: lxv +; P9BE-DAG: lxvx {{v[0-9]+}} +; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}} -; P9LE-DAG: lxv +; P9LE-DAG: lxvx {{v[0-9]+}} +; P9LE-DAG: lxvx ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 @@ -1584,16 +1584,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspsxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 
16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -2177,12 +2177,14 @@ entry: ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3) -; P9BE-DAG: lxv +; P9BE-DAG: addi r3, r3, -12 +; P9BE-DAG: lxvx {{v[0-9]+}}, 0, r3 +; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9LE-DAG: addi r3, r3, -12 +; P9LE-DAG: lxvx {{v[0-9]+}}, 0, r3 ; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr @@ -2742,16 +2744,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspuxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -3466,9 +3468,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4370,9 +4372,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: 
fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 90dd1d84fc23..6d19d7f0d629 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; FIXME: li [[R1:r[0-9]+]], 1 ; FIXME: li [[R2:r[0-9]+]], 0 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] -; CHECK-P9: lxv [[V1:v[0-9]+]] +; CHECK-P9: lxvx [[V1:v[0-9]+]] ; CHECK-P9: vadduqm v2, v2, [[V1]] ; CHECK-P9: blr @@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxv v2 -; CHECK-P9-DAG: lxv v3 +; CHECK-P9-DAG: lxvx v2 +; CHECK-P9-DAG: lxvx v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index e7640cab6aef..d573441f2cc9 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -33,11 +33,11 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] define void @bar1() { entry: @@ -56,9 +56,9 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] diff --git 
a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll index 0c29b6adad77..1ca679f474c3 100644 --- a/test/CodeGen/PowerPC/vsx-p9.ll +++ b/test/CodeGen/PowerPC/vsx-p9.ll @@ -36,8 +36,8 @@ entry: %1 = load <16 x i8>, <16 x i8>* @ucb, align 16 %add.i = add <16 x i8> %1, %0 tail call void (...) @sink(<16 x i8> %add.i) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -45,8 +45,8 @@ entry: %3 = load <16 x i8>, <16 x i8>* @scb, align 16 %add.i22 = add <16 x i8> %3, %2 tail call void (...) @sink(<16 x i8> %add.i22) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -54,8 +54,8 @@ entry: %5 = load <8 x i16>, <8 x i16>* @usb, align 16 %add.i21 = add <8 x i16> %5, %4 tail call void (...) @sink(<8 x i16> %add.i21) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -63,8 +63,8 @@ entry: %7 = load <8 x i16>, <8 x i16>* @ssb, align 16 %add.i20 = add <8 x i16> %7, %6 tail call void (...) @sink(<8 x i16> %add.i20) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -72,8 +72,8 @@ entry: %9 = load <4 x i32>, <4 x i32>* @uib, align 16 %add.i19 = add <4 x i32> %9, %8 tail call void (...) @sink(<4 x i32> %add.i19) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -81,8 +81,8 @@ entry: %11 = load <4 x i32>, <4 x i32>* @sib, align 16 %add.i18 = add <4 x i32> %11, %10 tail call void (...) 
@sink(<4 x i32> %add.i18) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -90,8 +90,8 @@ entry: %13 = load <2 x i64>, <2 x i64>* @ullb, align 16 %add.i17 = add <2 x i64> %13, %12 tail call void (...) @sink(<2 x i64> %add.i17) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -99,8 +99,8 @@ entry: %15 = load <2 x i64>, <2 x i64>* @sllb, align 16 %add.i16 = add <2 x i64> %15, %14 tail call void (...) @sink(<2 x i64> %add.i16) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -108,8 +108,8 @@ entry: %17 = load <1 x i128>, <1 x i128>* @uxb, align 16 %add.i15 = add <1 x i128> %17, %16 tail call void (...) @sink(<1 x i128> %add.i15) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -117,8 +117,8 @@ entry: %19 = load <1 x i128>, <1 x i128>* @sxb, align 16 %add.i14 = add <1 x i128> %19, %18 tail call void (...) @sink(<1 x i128> %add.i14) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -126,8 +126,8 @@ entry: %21 = load <4 x float>, <4 x float>* @vfb, align 16 %add.i13 = fadd <4 x float> %20, %21 tail call void (...) @sink(<4 x float> %add.i13) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvaddsp 34, 0, 1 ; CHECK: stxv 34, ; CHECK: bl sink @@ -135,8 +135,8 @@ entry: %23 = load <2 x double>, <2 x double>* @vdb, align 16 %add.i12 = fadd <2 x double> %22, %23 tail call void (...) 
@sink(<2 x double> %add.i12) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvadddp 0, 0, 1 ; CHECK: stxv 0, ; CHECK: bl sink diff --git a/test/CodeGen/SPARC/soft-mul-div.ll b/test/CodeGen/SPARC/soft-mul-div.ll new file mode 100644 index 000000000000..7c453dd35be7 --- /dev/null +++ b/test/CodeGen/SPARC/soft-mul-div.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=sparc -mcpu=v7 -O0 < %s | FileCheck %s + +define i32 @test_mul32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_mul32 + ; CHECK: call .umul + %m = mul i32 %a, %b + ret i32 %m +} + +define i16 @test_mul16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_mul16 + ; CHECK: call .umul + %m = mul i16 %a, %b + ret i16 %m +} + +define i8 @test_mul8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_mul8 + ; CHECK: call .umul + %m = mul i8 %a, %b + ret i8 %m +} + +define i32 @test_sdiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_sdiv32 + ; CHECK: call .div + %d = sdiv i32 %a, %b + ret i32 %d +} + +define i16 @test_sdiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_sdiv16 + ; CHECK: call .div + %d = sdiv i16 %a, %b + ret i16 %d +} + +define i8 @test_sdiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_sdiv8 + ; CHECK: call .div + %d = sdiv i8 %a, %b + ret i8 %d +} + +define i32 @test_udiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_udiv32 + ; CHECK: call .udiv + %d = udiv i32 %a, %b + ret i32 %d +} + +define i16 @test_udiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_udiv16 + ; CHECK: call .udiv + %d = udiv i16 %a, %b + ret i16 %d +} + +define i8 @test_udiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_udiv8 + ; CHECK: call .udiv + %d = udiv i8 %a, %b + ret i8 %d +} + diff --git a/test/CodeGen/SystemZ/branch-11.ll b/test/CodeGen/SystemZ/branch-11.ll new file mode 100644 index 000000000000..ce7b3ef267b4 --- /dev/null +++ b/test/CodeGen/SystemZ/branch-11.ll @@ -0,0 +1,56 @@ +; Test indirect jumps on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define i32 @f1(i32 %x, i32 %y, i32 %op) { +; CHECK-LABEL: f1: +; CHECK: ahi %r4, -1 +; CHECK: clibh %r4, 5, 0(%r14) +; CHECK: llgfr [[OP64:%r[0-5]]], %r4 +; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3 +; CHECK: larl [[BASE:%r[1-5]]] +; CHECK: bi 0([[BASE]],[[INDEX]]) +entry: + switch i32 %op, label %exit [ + i32 1, label %b.add + i32 2, label %b.sub + i32 3, label %b.and + i32 4, label %b.or + i32 5, label %b.xor + i32 6, label %b.mul + ] + +b.add: + %add = add i32 %x, %y + br label %exit + +b.sub: + %sub = sub i32 %x, %y + br label %exit + +b.and: + %and = and i32 %x, %y + br label %exit + +b.or: + %or = or i32 %x, %y + br label %exit + +b.xor: + %xor = xor i32 %x, %y + br label %exit + +b.mul: + %mul = mul i32 %x, %y + br label %exit + +exit: + %res = phi i32 [ %x, %entry ], + [ %add, %b.add ], + [ %sub, %b.sub ], + [ %and, %b.and ], + [ %or, %b.or ], + [ %xor, %b.xor ], + [ %mul, %b.mul ] + ret i32 %res +} diff --git a/test/CodeGen/SystemZ/fp-abs-03.ll b/test/CodeGen/SystemZ/fp-abs-03.ll new file mode 100644 index 000000000000..cab6c116bc08 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-03.ll @@ -0,0 +1,43 @@ +; Test floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call float @llvm.fabs.f32(float %f) + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call double @llvm.fabs.f64(double %f) + ret double %res +} + +; Test f128. With the loads and stores, a pure absolute would probably +; be better implemented using an NI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. 
+declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflpxb [[POSREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[POSREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %abs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-abs-04.ll b/test/CodeGen/SystemZ/fp-abs-04.ll new file mode 100644 index 000000000000..606bce3de36e --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-04.ll @@ -0,0 +1,46 @@ +; Test negated floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call float @llvm.fabs.f32(float %f) + %res = fsub float -0.0, %abs + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call double @llvm.fabs.f64(double %f) + %res = fsub double -0.0, %abs + ret double %res +} + +; Test f128. With the loads and stores, a pure negative-absolute would +; probably be better implemented using an OI on the upper byte. Do some +; extra processing so that using FPRs is unequivocally better. 
+declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflnxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %negabs = fsub fp128 0xL00000000000000008000000000000000, %abs + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %negabs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-add-01.ll b/test/CodeGen/SystemZ/fp-add-01.ll index 5b0ed0513a37..219607d628d7 100644 --- a/test/CodeGen/SystemZ/fp-add-01.ll +++ b/test/CodeGen/SystemZ/fp-add-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point addition. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: aeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: aeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-add-04.ll b/test/CodeGen/SystemZ/fp-add-04.ll new file mode 100644 index 000000000000..186f37ca5182 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-add-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point addition on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fadd fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll index 075c7aa3dd84..146b16bc695f 100644 --- a/test/CodeGen/SystemZ/fp-cmp-01.ll +++ b/test/CodeGen/SystemZ/fp-cmp-01.ll @@ -1,7 +1,10 @@ ; Test 32-bit floating-point comparison. The tests assume a z10 implementation ; of select, using conditional branches rather than LOCGR. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @foo() @@ -9,8 +12,9 @@ declare float @foo() define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { ; CHECK-LABEL: f1: ; CHECK: cebr %f0, %f2 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f1, %f2 %res = select i1 %cond, i64 %a, i64 %b @@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) { ; CHECK-LABEL: f2: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f2 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, float %f1, 
float *%ptr) { define i64 @f3(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f3: ; CHECK: ceb %f0, 4092(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f4: ; CHECK: aghi %r4, 4096 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f5: ; CHECK: aghi %r4, -4 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r5, 2 ; CHECK: ceb %f0, 400(%r1,%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr1 = getelementptr float, float *%base, i64 %index %ptr2 = getelementptr float, float *%ptr1, i64 100 @@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 @@ -153,8 +162,9 @@ define float @f7(float 
*%ptr0) { define i64 @f8(i64 %a, i64 %b, float %f) { ; CHECK-LABEL: f8: ; CHECK: ltebr %f0, %f0 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f, 0.0 %res = select i1 %cond, i64 %a, i64 %b @@ -166,8 +176,9 @@ define i64 @f8(i64 %a, i64 %b, float %f) { define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f9: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -179,8 +190,9 @@ define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f10: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp one float %f1, %f2 @@ -192,8 +204,9 @@ define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f11: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp olt float %f1, %f2 @@ -205,8 +218,9 @@ define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f12: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ole float %f1, %f2 @@ -218,8 +232,9 @@ define i64 @f12(i64 %a, i64 
%b, float %f2, float *%ptr) { define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f13: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnle %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oge float %f1, %f2 @@ -231,8 +246,9 @@ define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f14: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ogt float %f1, %f2 @@ -244,8 +260,9 @@ define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f15: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ueq float %f1, %f2 @@ -257,8 +274,9 @@ define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f16: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bner %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bner %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgre %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp une float %f1, %f2 @@ -270,8 +288,9 @@ define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f17: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrle %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ult float %f1, 
%f2 @@ -283,8 +302,9 @@ define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f18: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ule float %f1, %f2 @@ -296,8 +316,9 @@ define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f19: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp uge float %f1, %f2 @@ -309,8 +330,9 @@ define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f20: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ugt float %f1, %f2 diff --git a/test/CodeGen/SystemZ/fp-cmp-06.ll b/test/CodeGen/SystemZ/fp-cmp-06.ll new file mode 100644 index 000000000000..e146b51e4fb2 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-cmp-06.ll @@ -0,0 +1,33 @@ +; Test f128 comparisons on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; There is no memory form of 128-bit comparison. 
+define i64 @f1(i64 %a, i64 %b, fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r5) +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %cond = fcmp oeq fp128 %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Check comparison with zero -- it is not worthwhile to copy to +; FP pairs just so we can use LTXBR, so simply load up a zero. +define i64 @f2(i64 %a, i64 %b, fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %cond = fcmp oeq fp128 %f, 0xL00000000000000000000000000000000 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/fp-const-11.ll b/test/CodeGen/SystemZ/fp-const-11.ll new file mode 100644 index 000000000000..8523f2786c34 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-const-11.ll @@ -0,0 +1,40 @@ +; Test loads of f128 floating-point constants on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefix=CONST + +; Test loading zero. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000000000000000000000, fp128 *%x + ret void +} + +; Test loading of negative floating-point zero. +define void @f2(fp128 *%x) { +; CHECK-LABEL: f2: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: wflnxb [[REG]], [[REG]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000008000000000000000, fp128 *%x + ret void +} + +; Test loading of a 128-bit floating-point constant. 
This value would +; actually fit within the 32-bit format, but we don't have extending +; loads into vector registers. +define void @f3(fp128 *%x) { +; CHECK-LABEL: f3: +; CHECK: larl [[REGISTER:%r[1-5]+]], {{.*}} +; CHECK: vl [[REG:%v[0-9]+]], 0([[REGISTER]]) +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 +; CONST: .quad 4611404543484231680 +; CONST: .quad 0 + store fp128 0xL00000000000000003fff000002000000, fp128 *%x + ret void +} diff --git a/test/CodeGen/SystemZ/fp-conv-15.ll b/test/CodeGen/SystemZ/fp-conv-15.ll new file mode 100644 index 000000000000..61100016c426 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-15.ll @@ -0,0 +1,50 @@ +; Test f128 floating-point truncations/extensions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f128->f64. +define double @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to double + ret double %res +} + +; Test f128->f32. +define float @f2(fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 3 +; CHECK: ledbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to float + ret float %res +} + +; Test f64->f128. +define void @f3(fp128 *%dst, double %val) { +; CHECK-LABEL: f3: +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext double %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + +; Test f32->f128. 
+define void @f4(fp128 *%dst, float %val) { +; CHECK-LABEL: f4: +; CHECK: ldebr %f0, %f0 +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext float %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-conv-16.ll b/test/CodeGen/SystemZ/fp-conv-16.ll new file mode 100644 index 000000000000..4f9bb865694b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-16.ll @@ -0,0 +1,99 @@ +; Test f128 floating-point conversion to/from integers on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test signed i32->f128. +define void @f1(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK: cxfbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed i64->f128. +define void @f2(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK: cxgbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i32->f128. +define void @f3(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK: cxlfbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i64->f128. +define void @f4(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK: cxlgbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed f128->i32. +define i32 @f5(fp128 *%src) { +; CHECK-LABEL: f5: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cfxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i32 + ret i32 %conv +} + +; Test signed f128->i64. 
+define i64 @f6(fp128 *%src) { +; CHECK-LABEL: f6: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cgxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i64 + ret i64 %conv +} + +; Test unsigned f128->i32. +define i32 @f7(fp128 *%src) { +; CHECK-LABEL: f7: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clfxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i32 + ret i32 %conv +} + +; Test unsigned f128->i64. +define i64 @f8(fp128 *%src) { +; CHECK-LABEL: f8: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clgxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i64 + ret i64 %conv +} diff --git a/test/CodeGen/SystemZ/fp-copysign-02.ll b/test/CodeGen/SystemZ/fp-copysign-02.ll new file mode 100644 index 000000000000..657c0e18767b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-copysign-02.ll @@ -0,0 +1,81 @@ +; Test f128 copysign operations on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @copysignf(float, float) readnone +declare double @copysign(double, double) readnone +; FIXME: not really the correct prototype for SystemZ. +declare fp128 @copysignl(fp128, fp128) readnone + +; Test f32 copies in which the sign comes from an f128. +define float @f1(float %a, fp128 *%bptr) { +; CHECK-LABEL: f1: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to float + %res = call float @copysignf(float %a, float %b) readnone + ret float %res +} + +; Test f64 copies in which the sign comes from an f128. 
+define double @f2(double %a, fp128 *%bptr) { +; CHECK-LABEL: f2: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to double + %res = call double @copysign(double %a, double %b) readnone + ret double %res +} + +; Test f128 copies in which the sign comes from an f32. +define void @f7(fp128 *%cptr, fp128 *%aptr, float %bf) { +; CHECK-LABEL: f7: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmlh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext float %bf to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f64. +define void @f8(fp128 *%cptr, fp128 *%aptr, double %bd) { +; CHECK-LABEL: f8: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmhh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext double %bd to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f128. 
+define void @f9(fp128 *%cptr, fp128 *%aptr, fp128 *%bptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: vl [[REG2:%v[0-7]+]], 0(%r4) +; CHECK: tm +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = load volatile fp128, fp128 *%bptr + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-div-01.ll b/test/CodeGen/SystemZ/fp-div-01.ll index 0791e8db93f8..ee514dc474e9 100644 --- a/test/CodeGen/SystemZ/fp-div-01.ll +++ b/test/CodeGen/SystemZ/fp-div-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point division. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: deb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: deb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-div-04.ll b/test/CodeGen/SystemZ/fp-div-04.ll new file mode 100644 index 000000000000..54e87f46c84a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-div-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point division on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfdxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fdiv fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-move-13.ll b/test/CodeGen/SystemZ/fp-move-13.ll new file mode 100644 index 000000000000..d6c53eaceeef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-move-13.ll @@ -0,0 +1,46 @@ +; Test f128 moves on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; VR-to-VR moves. Since f128s are passed by reference, +; we need to force a copy by other means. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vlr +; CHECK: vleig +; CHECK: br %r14 + %val = load volatile fp128 , fp128 *%x + %t1 = bitcast fp128 %val to <2 x i64> + %t2 = insertelement <2 x i64> %t1, i64 0, i32 0 + %res = bitcast <2 x i64> %t2 to fp128 + store volatile fp128 %res, fp128 *%x + store volatile fp128 %val, fp128 *%x + ret void +} + +; Test 128-bit moves from GPRs to VRs. i128 isn't a legitimate type, +; so this goes through memory. +define void @f2(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f2: +; CHECK: lg +; CHECK: lg +; CHECK: stg +; CHECK: stg +; CHECK: br %r14 + %val = load i128 , i128 *%b + %res = bitcast i128 %val to fp128 + store fp128 %res, fp128 *%a + ret void +} + +; Test 128-bit moves from VRs to GPRs, with the same restriction as f2. 
+define void @f3(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f3: +; CHECK: vl +; CHECK: vst + %val = load fp128 , fp128 *%a + %res = bitcast fp128 %val to i128 + store i128 %res, i128 *%b + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-01.ll b/test/CodeGen/SystemZ/fp-mul-01.ll index 3b72d25e0b5c..126567b218ab 100644 --- a/test/CodeGen/SystemZ/fp-mul-01.ll +++ b/test/CodeGen/SystemZ/fp-mul-01.ll @@ -1,6 +1,8 @@ ; Test multiplication of two f32s, producing an f32 result. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: meeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: meeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-mul-06.ll b/test/CodeGen/SystemZ/fp-mul-06.ll index 896fafecbdaf..581e44eeaa2f 100644 --- a/test/CodeGen/SystemZ/fp-mul-06.ll +++ b/test/CodeGen/SystemZ/fp-mul-06.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: maebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: maebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmasb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call float @llvm.fma.f32 (float %f1, float %f2, float 
%acc) ret float %res @@ -14,7 +18,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) @@ -24,7 +29,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: maeb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -39,7 +45,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -54,7 +61,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -66,7 +74,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -78,7 +87,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 
+; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -92,7 +102,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: maeb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-08.ll b/test/CodeGen/SystemZ/fp-mul-08.ll index 5e5538bfacc9..5b1f9b96c089 100644 --- a/test/CodeGen/SystemZ/fp-mul-08.ll +++ b/test/CodeGen/SystemZ/fp-mul-08.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: msebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: msebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmssb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %negacc = fsub float -0.0, %acc %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) @@ -15,7 +19,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %negacc = fsub float -0.0, %acc @@ -26,7 +31,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: mseb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; 
CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -42,7 +48,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -58,7 +65,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -71,7 +79,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -84,7 +93,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -99,7 +109,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: mseb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-10.ll b/test/CodeGen/SystemZ/fp-mul-10.ll new file mode 
100644 index 000000000000..c23a6a202ad5 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-10.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @llvm.fma.f64(double %f1, double %f2, double %f3) +declare float @llvm.fma.f32(float %f1, float %f2, float %f3) + +define double @f1(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f1: +; CHECK: wfnmadb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define double @f2(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f2: +; CHECK: wfnmsdb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub double -0.0, %acc + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define float @f3(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f3: +; CHECK: wfnmasb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) + %negres = fsub float -0.0, %res + ret float %negres +} + +define float @f4(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f4: +; CHECK: wfnmssb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub float -0.0, %acc + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) + %negres = fsub float -0.0, %res + ret float %negres +} + diff --git a/test/CodeGen/SystemZ/fp-mul-11.ll b/test/CodeGen/SystemZ/fp-mul-11.ll new file mode 100644 index 000000000000..ef45bf184a4c --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-11.ll @@ -0,0 +1,32 @@ +; Test 128-bit floating-point multiplication on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fmul fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} + +define void @f2(double %f1, double %f2, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: wflld [[REG1:%v[0-9]+]], %f0 +; CHECK-DAG: wflld [[REG2:%v[0-9]+]], %f2 +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1x = fpext double %f1 to fp128 + %f2x = fpext double %f2 to fp128 + %res = fmul fp128 %f1x, %f2x + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-12.ll b/test/CodeGen/SystemZ/fp-mul-12.ll new file mode 100644 index 000000000000..331f9a30c274 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-12.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.fma.f128(fp128 %f1, fp128 %f2, fp128 %f3) + +define void @f1(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f2(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) 
+; CHECK: wfmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %neg) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f3(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + +define void @f4(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %neg) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-neg-02.ll b/test/CodeGen/SystemZ/fp-neg-02.ll new file mode 100644 index 000000000000..38fb3a58d404 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-neg-02.ll @@ -0,0 +1,41 @@ +; Test floating-point negation on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub float -0.0, %f + ret float %res +} + +; Test f64. +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub double -0.0, %f + ret double %res +} + +; Test f128. With the loads and stores, a pure negation would probably +; be better implemented using an XI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflcxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %negzero = fpext float -0.0 to fp128 + %neg = fsub fp128 0xL00000000000000008000000000000000, %orig + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %neg, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-round-03.ll b/test/CodeGen/SystemZ/fp-round-03.ll new file mode 100644 index 000000000000..762e793701d1 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-round-03.ll @@ -0,0 +1,207 @@ +; Test rounding functions for z14 and above. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test rint for f32. +declare float @llvm.rint.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call float @llvm.rint.f32(float %f) + ret float %res +} + +; Test rint for f64. +declare double @llvm.rint.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: fidbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call double @llvm.rint.f64(double %f) + ret double %res +} + +; Test rint for f128. 
+declare fp128 @llvm.rint.f128(fp128 %f) +define void @f3(fp128 *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 0, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.rint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test nearbyint for f32. +declare float @llvm.nearbyint.f32(float %f) +define float @f4(float %f) { +; CHECK-LABEL: f4: +; CHECK: fiebra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.nearbyint.f32(float %f) + ret float %res +} + +; Test nearbyint for f64. +declare double @llvm.nearbyint.f64(double %f) +define double @f5(double %f) { +; CHECK-LABEL: f5: +; CHECK: fidbra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.nearbyint.f64(double %f) + ret double %res +} + +; Test nearbyint for f128. +declare fp128 @llvm.nearbyint.f128(fp128 %f) +define void @f6(fp128 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.nearbyint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test floor for f32. +declare float @llvm.floor.f32(float %f) +define float @f7(float %f) { +; CHECK-LABEL: f7: +; CHECK: fiebra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.floor.f32(float %f) + ret float %res +} + +; Test floor for f64. +declare double @llvm.floor.f64(double %f) +define double @f8(double %f) { +; CHECK-LABEL: f8: +; CHECK: fidbra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.floor.f64(double %f) + ret double %res +} + +; Test floor for f128. 
+declare fp128 @llvm.floor.f128(fp128 %f) +define void @f9(fp128 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 7 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.floor.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test ceil for f32. +declare float @llvm.ceil.f32(float %f) +define float @f10(float %f) { +; CHECK-LABEL: f10: +; CHECK: fiebra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.ceil.f32(float %f) + ret float %res +} + +; Test ceil for f64. +declare double @llvm.ceil.f64(double %f) +define double @f11(double %f) { +; CHECK-LABEL: f11: +; CHECK: fidbra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.ceil.f64(double %f) + ret double %res +} + +; Test ceil for f128. +declare fp128 @llvm.ceil.f128(fp128 %f) +define void @f12(fp128 *%ptr) { +; CHECK-LABEL: f12: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 6 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.ceil.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test trunc for f32. +declare float @llvm.trunc.f32(float %f) +define float @f13(float %f) { +; CHECK-LABEL: f13: +; CHECK: fiebra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.trunc.f32(float %f) + ret float %res +} + +; Test trunc for f64. +declare double @llvm.trunc.f64(double %f) +define double @f14(double %f) { +; CHECK-LABEL: f14: +; CHECK: fidbra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.trunc.f64(double %f) + ret double %res +} + +; Test trunc for f128. 
+declare fp128 @llvm.trunc.f128(fp128 %f) +define void @f15(fp128 *%ptr) { +; CHECK-LABEL: f15: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 5 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.trunc.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test round for f32. +declare float @llvm.round.f32(float %f) +define float @f16(float %f) { +; CHECK-LABEL: f16: +; CHECK: fiebra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.round.f32(float %f) + ret float %res +} + +; Test round for f64. +declare double @llvm.round.f64(double %f) +define double @f17(double %f) { +; CHECK-LABEL: f17: +; CHECK: fidbra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.round.f64(double %f) + ret double %res +} + +; Test round for f128. +declare fp128 @llvm.round.f128(fp128 %f) +define void @f18(fp128 *%ptr) { +; CHECK-LABEL: f18: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 1 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.round.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll index 3680207e7f20..85a46bc2d7fc 100644 --- a/test/CodeGen/SystemZ/fp-sqrt-01.ll +++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit square root. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @llvm.sqrt.f32(float) declare float @sqrtf(float) @@ -77,7 +79,7 @@ define float @f6(float *%base, i64 %index) { ; to use SQEB if possible. 
define void @f7(float *%ptr) { ; CHECK-LABEL: f7: -; CHECK: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %val0 = load volatile float , float *%ptr %val1 = load volatile float , float *%ptr @@ -160,7 +162,7 @@ define float @f8(float %dummy, float %val) { ; CHECK: sqebr %f0, %f2 ; CHECK: cebr %f0, %f0 ; CHECK: bnor %r14 -; CHECK: ler %f0, %f2 +; CHECK: {{ler|ldr}} %f0, %f2 ; CHECK: jg sqrtf@PLT %res = tail call float @sqrtf(float %val) ret float %res diff --git a/test/CodeGen/SystemZ/fp-sqrt-04.ll b/test/CodeGen/SystemZ/fp-sqrt-04.ll new file mode 100644 index 000000000000..e0fb2569b39a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sqrt-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point square root on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.sqrt.f128(fp128 %f) + +define void @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfsqxb [[RES:%v[0-9]+]], [[REG]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %res = call fp128 @llvm.sqrt.f128(fp128 %f) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sub-01.ll b/test/CodeGen/SystemZ/fp-sub-01.ll index f4185ca3108d..41f72e1810e9 100644 --- a/test/CodeGen/SystemZ/fp-sub-01.ll +++ b/test/CodeGen/SystemZ/fp-sub-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point subtraction. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: seb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: seb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-sub-04.ll b/test/CodeGen/SystemZ/fp-sub-04.ll new file mode 100644 index 000000000000..5f88132664ef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sub-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point subtraction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fsub fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/int-add-17.ll b/test/CodeGen/SystemZ/int-add-17.ll new file mode 100644 index 000000000000..fd245871c652 --- /dev/null +++ b/test/CodeGen/SystemZ/int-add-17.ll @@ -0,0 +1,95 @@ +; Test additions between an i64 and a sign-extended i16 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check AGH with no displacement. 
+define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the aligned AGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: agh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the negative aligned AGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: agh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the low end of the AGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: agh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check that AGH allows an index. 
+define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: agh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + diff --git a/test/CodeGen/SystemZ/int-mul-09.ll b/test/CodeGen/SystemZ/int-mul-09.ll new file mode 100644 index 000000000000..3e384e72db5d --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-09.ll @@ -0,0 +1,95 @@ +; Test multiplications between an i64 and a sign-extended i16 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check MGH with no displacement. +define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the aligned MGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: mgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the negative aligned MGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: mgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the low end of the MGH range. 
+define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: mgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check that MGH allows an index. +define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: mgh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-mul-10.ll b/test/CodeGen/SystemZ/int-mul-10.ll new file mode 100644 index 000000000000..a4d80af36a3c --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-10.ll @@ -0,0 +1,165 @@ +; Test signed high-part i64->i128 multiplications on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check sign-extended multiplication in which only the high part is used. +define i64 @f1(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which only part of the high half +; is used. 
+define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk [[REG:%r[0-9]+]], %r3, %r4 +; CHECK: srlg %r2, [[REG]], 3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 67 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which the result is split into +; high and low halves. +define i64 @f3(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f3: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: ogr %r2, %r3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + %low = trunc i128 %mulx to i64 + %or = or i64 %high, %low + ret i64 %or +} + +; Check MG with no displacement. +define i64 @f4(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f4: +; CHECK-NOT: {{%r[234]}} +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %b = load i64 , i64 *%src + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the aligned MG range. +define i64 @f5(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f5: +; CHECK: mg %r2, 524280(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65535 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword up, which requires separate address logic. +; Other sequences besides this one would be OK. 
+define i64 @f6(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r4, 524288 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the negative aligned MG range. +define i64 @f7(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f7: +; CHECK: mg %r2, -8(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -1 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the low end of the MG range. +define i64 @f8(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f8: +; CHECK: mg %r2, -524288(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f9(i64 *%dest, i64 %a, i64 *%src) { +; CHECK-LABEL: f9: +; CHECK: agfi %r4, -524296 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65537 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check that MG allows an index. 
+define i64 @f10(i64 *%dest, i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f10: +; CHECK: mg %r2, 524287(%r5,%r4) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to i64 * + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + diff --git a/test/CodeGen/SystemZ/int-mul-11.ll b/test/CodeGen/SystemZ/int-mul-11.ll new file mode 100644 index 000000000000..f26251982518 --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-11.ll @@ -0,0 +1,32 @@ +; Test three-operand multiplication instructions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Check MSRKC. +define i32 @f1(i32 %dummy, i32 %a, i32 %b) { +; CHECK-LABEL: f1: +; CHECK: msrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i32 %a, %b + ret i32 %mul +} + +; Check MSGRKC. +define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK: msgrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i64 %a, %b + ret i64 %mul +} + +; Verify that we still use MSGFR for i32->i64 multiplies. +define i64 @f3(i64 %a, i32 %b) { +; CHECK-LABEL: f3: +; CHECK: msgfr %r2, %r3 +; CHECK: br %r14 + %bext = sext i32 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-sub-10.ll b/test/CodeGen/SystemZ/int-sub-10.ll new file mode 100644 index 000000000000..bf6638575e55 --- /dev/null +++ b/test/CodeGen/SystemZ/int-sub-10.ll @@ -0,0 +1,95 @@ +; Test subtractions of a sign-extended i16 from an i64 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check SGH with no displacement. 
+define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the aligned SGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: sgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the negative aligned SGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: sgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the low end of the SGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: sgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check that SGH allows an index. 
+define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: sgh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + diff --git a/test/CodeGen/SystemZ/tdc-07.ll b/test/CodeGen/SystemZ/tdc-07.ll new file mode 100644 index 000000000000..6651410e7c66 --- /dev/null +++ b/test/CodeGen/SystemZ/tdc-07.ll @@ -0,0 +1,18 @@ +; Test the Test Data Class instruction on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i32 @llvm.s390.tdc.f128(fp128, i64) + +; Check using as i32 - f128 +define i32 @f3(fp128 %x) { +; CHECK-LABEL: f3 +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: tcxb %f0, 123 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 + %res = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 123) + ret i32 %res +} + diff --git a/test/CodeGen/SystemZ/vec-abs-06.ll b/test/CodeGen/SystemZ/vec-abs-06.ll new file mode 100644 index 000000000000..8eee1d9d2507 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-abs-06.ll @@ -0,0 +1,47 @@ +; Test f32 and v4f32 absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.fabs.f32(float) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) + +; Test a plain absolute. +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vflpsb %v24, %v24 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val) + ret <4 x float> %ret +} + +; Test a negative absolute. +define <4 x float> @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: vflnsb %v24, %v24 +; CHECK: br %r14 + %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val) + %ret = fsub <4 x float> , %abs + ret <4 x float> %ret +} + +; Test an f32 absolute that uses vector registers. 
+define float @f3(<4 x float> %val) { +; CHECK-LABEL: f3: +; CHECK: wflpsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = call float @llvm.fabs.f32(float %scalar) + ret float %ret +} + +; Test an f32 negative absolute that uses vector registers. +define float @f4(<4 x float> %val) { +; CHECK-LABEL: f4: +; CHECK: wflnsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %abs = call float @llvm.fabs.f32(float %scalar) + %ret = fsub float -0.0, %abs + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-add-02.ll b/test/CodeGen/SystemZ/vec-add-02.ll new file mode 100644 index 000000000000..97a9b84a063c --- /dev/null +++ b/test/CodeGen/SystemZ/vec-add-02.ll @@ -0,0 +1,24 @@ +; Test vector addition on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 addition. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfasb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fadd <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 addition that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfasb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fadd float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-and-04.ll b/test/CodeGen/SystemZ/vec-and-04.ll new file mode 100644 index 000000000000..e9355beb4296 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-and-04.ll @@ -0,0 +1,47 @@ +; Test vector NAND on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 NAND. 
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <16 x i8> %val1, %val2 + %not = xor <16 x i8> %ret, + ret <16 x i8> %not +} + +; Test a v8i16 NAND. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <8 x i16> %val1, %val2 + %not = xor <8 x i16> %ret, + ret <8 x i16> %not +} + +; Test a v4i32 NAND. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <4 x i32> %val1, %val2 + %not = xor <4 x i32> %ret, + ret <4 x i32> %not +} + +; Test a v2i64 NAND. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <2 x i64> %val1, %val2 + %not = xor <2 x i64> %ret, + ret <2 x i64> %not +} diff --git a/test/CodeGen/SystemZ/vec-cmp-07.ll b/test/CodeGen/SystemZ/vec-cmp-07.ll new file mode 100644 index 000000000000..f272ba4bd755 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-07.ll @@ -0,0 +1,349 @@ +; Test f32 and v4f32 comparisons on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test oeq. +define <4 x i32> @f1(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfcesb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test one. 
+define <4 x i32> @f2(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vo %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ogt. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f3: +; CHECK: vfchsb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oge. +define <4 x i32> @f4(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f4: +; CHECK: vfchesb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ole. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfchesb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test olt. +define <4 x i32> @f6(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK: vfchsb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ueq. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vno %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test une. 
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f8: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ugt. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f9: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uge. +define <4 x i32> @f10(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f10: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ule. +define <4 x i32> @f11(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f11: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ult. +define <4 x i32> @f12(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f12: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ord. 
+define <4 x i32> @f13(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f13: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vo %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uno. +define <4 x i32> @f14(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f14: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vno %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oeq selects. +define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f15: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test one selects. +define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f16: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ogt selects. 
+define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f17: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test oge selects. +define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f18: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ole selects. +define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f19: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test olt selects. +define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f20: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ueq selects. 
+define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f21: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test une selects. +define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f22: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ugt selects. +define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f23: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uge selects. +define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f24: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ule selects. 
+define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f25: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ult selects. +define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f26: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ord selects. +define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f27: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uno selects. +define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f28: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test an f32 comparison that uses vector registers. 
+define i64 @f29(i64 %a, i64 %b, float %f1, <4 x float> %vec) { +; CHECK-LABEL: f29: +; CHECK: wfcsb %f0, %v24 +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f2 = extractelement <4 x float> %vec, i32 0 + %cond = fcmp oeq float %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/vec-ctpop-02.ll b/test/CodeGen/SystemZ/vec-ctpop-02.ll new file mode 100644 index 000000000000..ee50e88d0430 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-ctpop-02.ll @@ -0,0 +1,45 @@ +; Test vector population-count instruction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + +define <16 x i8> @f1(<16 x i8> %a) { +; CHECK-LABEL: f1: +; CHECK: vpopctb %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) + ret <16 x i8> %popcnt +} + +define <8 x i16> @f2(<8 x i16> %a) { +; CHECK-LABEL: f2: +; CHECK: vpopcth %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) + ret <8 x i16> %popcnt +} + +define <4 x i32> @f3(<4 x i32> %a) { +; CHECK-LABEL: f3: +; CHECK: vpopctf %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) + ret <4 x i32> %popcnt +} + +define <2 x i64> @f4(<2 x i64> %a) { +; CHECK-LABEL: f4: +; CHECK: vpopctg %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + ret <2 x i64> %popcnt +} + diff --git a/test/CodeGen/SystemZ/vec-div-02.ll b/test/CodeGen/SystemZ/vec-div-02.ll new file mode 100644 index 000000000000..74e3b5148ad5 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-div-02.ll @@ -0,0 +1,24 @@ +; Test vector division on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 division. 
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfdsb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fdiv <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 division that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfdsb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fdiv float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/test/CodeGen/SystemZ/vec-intrinsics-01.ll new file mode 100644 index 000000000000..6f5eb0691aa8 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-intrinsics-01.ll @@ -0,0 +1,3335 @@ +; Test vector intrinsics. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i32 @llvm.s390.lcbb(i8 *, i32) +declare <16 x i8> @llvm.s390.vlbb(i8 *, i32) +declare <16 x i8> @llvm.s390.vll(i32, i8 *) +declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32) +declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>) +declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>) +declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>) +declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>) +declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>) +declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *) +declare <8 x i16> 
@llvm.s390.vuphb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuphh(<8 x i16>) +declare <2 x i64> @llvm.s390.vuphf(<4 x i32>) +declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>) +declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>) +declare <8 x i16> @llvm.s390.vuplb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>) +declare <2 x i64> @llvm.s390.vuplf(<4 x i32>) +declare <8 x i16> @llvm.s390.vupllb(<16 x i8>) +declare <4 x i32> @llvm.s390.vupllh(<8 x i16>) +declare <2 x i64> @llvm.s390.vupllf(<4 x i32>) +declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vavgf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vavgg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vavglb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vavglh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vavglf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vavglg(<2 x i64>, <2 x i64>) +declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) +declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x 
i64>) +declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) +declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vmalhb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmalhh(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmalhf(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmaeb(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaeh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmaef(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmaleb(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaleh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmalef(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmaob(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaoh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmaof(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmalob(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaloh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmalof(<4 x i32>, <4 x i32>, <2 x i64>) +declare <16 x i8> @llvm.s390.vmhb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmhh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmhf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vmlhb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmlhh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmlhf(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmeb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmeh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmef(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmleb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmleh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmlef(<4 x 
i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmob(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmoh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmlof(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) +declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32) +declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) +declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) +declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare <2 x i64> @llvm.s390.verimg(<2 x i64>, <2 x i64>, <2 x i64>, i32) +declare <16 x i8> @llvm.s390.vsl(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vslb(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsra(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrab(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrl(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrlb(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, 
<16 x i8>) +declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) +declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) +declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vfaeb(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vfaeh(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vfaef(<4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8>, <16 x i8>, i32) +declare {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16>, <8 x i16>, i32) +declare {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.s390.vfaezb(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vfaezh(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vfaezf(<4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8>, <16 x i8>, i32) +declare {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16>, <8 x i16>, i32) +declare {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32>, <4 x 
i32>, i32) +declare <16 x i8> @llvm.s390.vfeeb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeeh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfeef(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfeezb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeezh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfeezf(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfeneb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeneh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfenef(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfenezb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfenezh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfenezf(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vistrb(<16 x i8>) +declare <8 x i16> @llvm.s390.vistrh(<8 x i16>) +declare <4 x i32> @llvm.s390.vistrf(<4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32>) +declare <16 x i8> @llvm.s390.vstrcb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vstrch(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x 
i32> @llvm.s390.vstrcf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8>, <16 x i8>, <16 x i8>, + i32) +declare {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16>, <8 x i16>, <8 x i16>, + i32) +declare {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32>, <4 x i32>, <4 x i32>, + i32) +declare <16 x i8> @llvm.s390.vstrczb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vstrczh(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vstrczf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8>, <16 x i8>, <16 x i8>, + i32) +declare {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16>, <8 x i16>, <8 x i16>, + i32) +declare {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32>, <4 x i32>, <4 x i32>, + i32) +declare {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double>, i32) +declare <2 x double> @llvm.s390.vfidb(<2 x double>, i32, i32) + +; LCBB with the lowest M3 operand. +define i32 @test_lcbb1(i8 *%ptr) { +; CHECK-LABEL: test_lcbb1: +; CHECK: lcbb %r2, 0(%r2), 0 +; CHECK: br %r14 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 0) + ret i32 %res +} + +; LCBB with the highest M3 operand. +define i32 @test_lcbb2(i8 *%ptr) { +; CHECK-LABEL: test_lcbb2: +; CHECK: lcbb %r2, 0(%r2), 15 +; CHECK: br %r14 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 15) + ret i32 %res +} + +; LCBB with a displacement and index. +define i32 @test_lcbb3(i8 *%base, i64 %index) { +; CHECK-LABEL: test_lcbb3: +; CHECK: lcbb %r2, 4095({{%r2,%r3|%r3,%r2}}), 4 +; CHECK: br %r14 + %add = add i64 %index, 4095 + %ptr = getelementptr i8, i8 *%base, i64 %add + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 4) + ret i32 %res +} + +; LCBB with an out-of-range displacement. 
+define i32 @test_lcbb4(i8 *%base) { +; CHECK-LABEL: test_lcbb4: +; CHECK: lcbb %r2, 0({{%r[1-5]}}), 5 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 5) + ret i32 %res +} + +; VLBB with the lowest M3 operand. +define <16 x i8> @test_vlbb1(i8 *%ptr) { +; CHECK-LABEL: test_vlbb1: +; CHECK: vlbb %v24, 0(%r2), 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 0) + ret <16 x i8> %res +} + +; VLBB with the highest M3 operand. +define <16 x i8> @test_vlbb2(i8 *%ptr) { +; CHECK-LABEL: test_vlbb2: +; CHECK: vlbb %v24, 0(%r2), 15 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 15) + ret <16 x i8> %res +} + +; VLBB with a displacement and index. +define <16 x i8> @test_vlbb3(i8 *%base, i64 %index) { +; CHECK-LABEL: test_vlbb3: +; CHECK: vlbb %v24, 4095({{%r2,%r3|%r3,%r2}}), 4 +; CHECK: br %r14 + %add = add i64 %index, 4095 + %ptr = getelementptr i8, i8 *%base, i64 %add + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 4) + ret <16 x i8> %res +} + +; VLBB with an out-of-range displacement. +define <16 x i8> @test_vlbb4(i8 *%base) { +; CHECK-LABEL: test_vlbb4: +; CHECK: vlbb %v24, 0({{%r[1-5]}}), 5 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 5) + ret <16 x i8> %res +} + +; VLL with the lowest in-range displacement. +define <16 x i8> @test_vll1(i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vll1: +; CHECK: vll %v24, %r3, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLL with the highest in-range displacement. +define <16 x i8> @test_vll2(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vll2: +; CHECK: vll %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLL with an out-of-range displacementa. 
+define <16 x i8> @test_vll3(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vll3: +; CHECK: vll %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLL doesn't allow an index. +define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vll4: +; CHECK: vll %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VPDI taking element 0 from each half. +define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpdi1: +; CHECK: vpdi %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 0) + ret <2 x i64> %res +} + +; VPDI taking element 1 from each half. +define <2 x i64> @test_vpdi2(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpdi2: +; CHECK: vpdi %v24, %v24, %v26, 10 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 10) + ret <2 x i64> %res +} + +; VPERM. +define <16 x i8> @test_vperm(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vperm: +; CHECK: vperm %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vperm(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VPKSH. +define <16 x i8> @test_vpksh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vpksh: +; CHECK: vpksh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vpksh(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %res +} + +; VPKSF. +define <8 x i16> @test_vpksf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vpksf: +; CHECK: vpksf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vpksf(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %res +} + +; VPKSG. 
+define <4 x i32> @test_vpksg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpksg: +; CHECK: vpksg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vpksg(<2 x i64> %a, <2 x i64> %b) + ret <4 x i32> %res +} + +; VPKSHS with no processing of the result. +define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpkshs: +; CHECK: vpkshs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VPKSHS, storing to %ptr if all values were saturated. +define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) { +; CHECK-LABEL: test_vpkshs_all_store: +; CHECK: vpkshs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VPKSFS with no processing of the result. +define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpksfs: +; CHECK: vpksfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VPKSFS, storing to %ptr if any values were saturated. 
+define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { +; CHECK-LABEL: test_vpksfs_any_store: +; CHECK: vpksfs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VPKSGS with no processing of the result. +define <4 x i32> @test_vpksgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpksgs: +; CHECK: vpksgs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VPKSGS, storing to %ptr if no elements were saturated +define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpksgs_none_store: +; CHECK: vpksgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VPKLSH. +define <16 x i8> @test_vpklsh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vpklsh: +; CHECK: vpklsh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vpklsh(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %res +} + +; VPKLSF. 
+define <8 x i16> @test_vpklsf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vpklsf: +; CHECK: vpklsf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %res +} + +; VPKLSG. +define <4 x i32> @test_vpklsg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpklsg: +; CHECK: vpklsg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> %a, <2 x i64> %b) + ret <4 x i32> %res +} + +; VPKLSHS with no processing of the result. +define <16 x i8> @test_vpklshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklshs: +; CHECK: vpklshs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VPKLSHS, storing to %ptr if all values were saturated. +define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklshs_all_store: +; CHECK: vpklshs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp eq i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VPKLSFS with no processing of the result. 
+define <8 x i16> @test_vpklsfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklsfs: +; CHECK: vpklsfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VPKLSFS, storing to %ptr if any values were saturated. +define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklsfs_any_store: +; CHECK: vpklsfs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ne i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VPKLSGS with no processing of the result. 
+define <4 x i32> @test_vpklsgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklsgs: +; CHECK: vpklsgs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VPKLSGS, storing to %ptr if no elements were saturated +define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklsgs_none_store: +; CHECK: vpklsgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VSTL with the lowest in-range displacement. +define void @test_vstl1(<16 x i8> %vec, i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vstl1: +; CHECK: vstl %v24, %r3, 0(%r2) +; CHECK: br %r14 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTL with the highest in-range displacement. +define void @test_vstl2(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstl2: +; CHECK: vstl %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTL with an out-of-range displacement. 
+define void @test_vstl3(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstl3: +; CHECK: vstl %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; Check that VSTL doesn't allow an index. +define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vstl4: +; CHECK: vstl %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VUPHB. +define <8 x i16> @test_vuphb(<16 x i8> %a) { +; CHECK-LABEL: test_vuphb: +; CHECK: vuphb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuphb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPHH. +define <4 x i32> @test_vuphh(<8 x i16> %a) { +; CHECK-LABEL: test_vuphh: +; CHECK: vuphh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuphh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPHF. +define <2 x i64> @test_vuphf(<4 x i32> %a) { +; CHECK-LABEL: test_vuphf: +; CHECK: vuphf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuphf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLHB. +define <8 x i16> @test_vuplhb(<16 x i8> %a) { +; CHECK-LABEL: test_vuplhb: +; CHECK: vuplhb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuplhb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLHH. +define <4 x i32> @test_vuplhh(<8 x i16> %a) { +; CHECK-LABEL: test_vuplhh: +; CHECK: vuplhh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuplhh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLHF. +define <2 x i64> @test_vuplhf(<4 x i32> %a) { +; CHECK-LABEL: test_vuplhf: +; CHECK: vuplhf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLB. 
+define <8 x i16> @test_vuplb(<16 x i8> %a) { +; CHECK-LABEL: test_vuplb: +; CHECK: vuplb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuplb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLHW. +define <4 x i32> @test_vuplhw(<8 x i16> %a) { +; CHECK-LABEL: test_vuplhw: +; CHECK: vuplhw %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuplhw(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLF. +define <2 x i64> @test_vuplf(<4 x i32> %a) { +; CHECK-LABEL: test_vuplf: +; CHECK: vuplf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuplf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLLB. +define <8 x i16> @test_vupllb(<16 x i8> %a) { +; CHECK-LABEL: test_vupllb: +; CHECK: vupllb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vupllb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLLH. +define <4 x i32> @test_vupllh(<8 x i16> %a) { +; CHECK-LABEL: test_vupllh: +; CHECK: vupllh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vupllh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLLF. +define <2 x i64> @test_vupllf(<4 x i32> %a) { +; CHECK-LABEL: test_vupllf: +; CHECK: vupllf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vupllf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VACCB. +define <16 x i8> @test_vaccb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaccb: +; CHECK: vaccb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaccb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACCH. +define <8 x i16> @test_vacch(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vacch: +; CHECK: vacch %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vacch(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VACCF. 
+define <4 x i32> @test_vaccf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaccf: +; CHECK: vaccf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vaccf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VACCG. +define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vaccg: +; CHECK: vaccg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vaccg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VAQ. +define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaq: +; CHECK: vaq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACQ. +define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vacq: +; CHECK: vacq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VACCQ. +define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaccq: +; CHECK: vaccq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACCCQ. +define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vacccq: +; CHECK: vacccq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VAVGB. +define <16 x i8> @test_vavgb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vavgb: +; CHECK: vavgb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vavgb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VAVGH. 
+define <8 x i16> @test_vavgh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vavgh: +; CHECK: vavgh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vavgh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VAVGF. +define <4 x i32> @test_vavgf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vavgf: +; CHECK: vavgf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vavgf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VAVGG. +define <2 x i64> @test_vavgg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vavgg: +; CHECK: vavgg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vavgg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VAVGLB. +define <16 x i8> @test_vavglb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vavglb: +; CHECK: vavglb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vavglb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VAVGLH. +define <8 x i16> @test_vavglh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vavglh: +; CHECK: vavglh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vavglh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VAVGLF. +define <4 x i32> @test_vavglf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vavglf: +; CHECK: vavglf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vavglf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VAVGLG. +define <2 x i64> @test_vavglg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vavglg: +; CHECK: vavglg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vavglg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VCKSM. +define <4 x i32> @test_vcksm(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vcksm: +; CHECK: vcksm %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vcksm(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VGFMB. 
+define <8 x i16> @test_vgfmb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vgfmb: +; CHECK: vgfmb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VGFMH. +define <4 x i32> @test_vgfmh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vgfmh: +; CHECK: vgfmh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VGFMF. +define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vgfmf: +; CHECK: vgfmf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VGFMG. +define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vgfmg: +; CHECK: vgfmg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) + ret <16 x i8> %res +} + +; VGFMAB. +define <8 x i16> @test_vgfmab(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vgfmab: +; CHECK: vgfmab %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VGFMAH. +define <4 x i32> @test_vgfmah(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vgfmah: +; CHECK: vgfmah %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VGFMAF. +define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vgfmaf: +; CHECK: vgfmaf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VGFMAG. 
+define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vgfmag: +; CHECK: vgfmag %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMAHB. +define <16 x i8> @test_vmahb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmahb: +; CHECK: vmahb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmahb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMAHH. +define <8 x i16> @test_vmahh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmahh: +; CHECK: vmahh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmahh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAHF. +define <4 x i32> @test_vmahf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmahf: +; CHECK: vmahf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmahf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALHB. +define <16 x i8> @test_vmalhb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmalhb: +; CHECK: vmalhb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmalhb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMALHH. +define <8 x i16> @test_vmalhh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmalhh: +; CHECK: vmalhh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmalhh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALHF. +define <4 x i32> @test_vmalhf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmalhf: +; CHECK: vmalhf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmalhf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAEB. 
+define <8 x i16> @test_vmaeb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaeb: +; CHECK: vmaeb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaeb(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAEH. +define <4 x i32> @test_vmaeh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaeh: +; CHECK: vmaeh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaeh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAEF. +define <2 x i64> @test_vmaef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmaef: +; CHECK: vmaef %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmaef(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMALEB. +define <8 x i16> @test_vmaleb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaleb: +; CHECK: vmaleb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaleb(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALEH. +define <4 x i32> @test_vmaleh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaleh: +; CHECK: vmaleh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaleh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALEF. +define <2 x i64> @test_vmalef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmalef: +; CHECK: vmalef %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmalef(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMAOB. +define <8 x i16> @test_vmaob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaob: +; CHECK: vmaob %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaob(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAOH. 
+define <4 x i32> @test_vmaoh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaoh: +; CHECK: vmaoh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaoh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAOF. +define <2 x i64> @test_vmaof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmaof: +; CHECK: vmaof %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmaof(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMALOB. +define <8 x i16> @test_vmalob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmalob: +; CHECK: vmalob %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmalob(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALOH. +define <4 x i32> @test_vmaloh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaloh: +; CHECK: vmaloh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaloh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALOF. +define <2 x i64> @test_vmalof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmalof: +; CHECK: vmalof %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmalof(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMHB. +define <16 x i8> @test_vmhb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmhb: +; CHECK: vmhb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmhb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VMHH. +define <8 x i16> @test_vmhh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmhh: +; CHECK: vmhh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmhh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VMHF. 
+define <4 x i32> @test_vmhf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmhf: +; CHECK: vmhf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmhf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VMLHB. +define <16 x i8> @test_vmlhb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlhb: +; CHECK: vmlhb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmlhb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VMLHH. +define <8 x i16> @test_vmlhh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlhh: +; CHECK: vmlhh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmlhh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VMLHF. +define <4 x i32> @test_vmlhf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlhf: +; CHECK: vmlhf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmlhf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VMEB. +define <8 x i16> @test_vmeb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmeb: +; CHECK: vmeb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmeb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMEH. +define <4 x i32> @test_vmeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmeh: +; CHECK: vmeh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmeh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMEF. +define <2 x i64> @test_vmef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmef: +; CHECK: vmef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmef(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMLEB. +define <8 x i16> @test_vmleb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmleb: +; CHECK: vmleb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmleb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMLEH. 
+define <4 x i32> @test_vmleh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmleh: +; CHECK: vmleh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmleh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMLEF. +define <2 x i64> @test_vmlef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlef: +; CHECK: vmlef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmlef(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMOB. +define <8 x i16> @test_vmob(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmob: +; CHECK: vmob %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmob(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMOH. +define <4 x i32> @test_vmoh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmoh: +; CHECK: vmoh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmoh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMOF. +define <2 x i64> @test_vmof(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmof: +; CHECK: vmof %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmof(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMLOB. +define <8 x i16> @test_vmlob(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlob: +; CHECK: vmlob %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmlob(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMLOH. +define <4 x i32> @test_vmloh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmloh: +; CHECK: vmloh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmloh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMLOF. +define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlof: +; CHECK: vmlof %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmlof(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VERLLVB. 
+define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_verllvb: +; CHECK: verllvb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VERLLVH. +define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_verllvh: +; CHECK: verllvh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VERLLVF. +define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_verllvf: +; CHECK: verllvf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VERLLVG. +define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_verllvg: +; CHECK: verllvg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VERLLB. +define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { +; CHECK-LABEL: test_verllb: +; CHECK: verllb %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) + ret <16 x i8> %res +} + +; VERLLH. +define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { +; CHECK-LABEL: test_verllh: +; CHECK: verllh %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) + ret <8 x i16> %res +} + +; VERLLF. +define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_verllf: +; CHECK: verllf %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) + ret <4 x i32> %res +} + +; VERLLG. +define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { +; CHECK-LABEL: test_verllg: +; CHECK: verllg %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) + ret <2 x i64> %res +} + +; VERLLB with the smallest count. 
+define <16 x i8> @test_verllb_1(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_1: +; CHECK: verllb %v24, %v24, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) + ret <16 x i8> %res +} + +; VERLLB with the largest count. +define <16 x i8> @test_verllb_4095(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_4095: +; CHECK: verllb %v24, %v24, 4095 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) + ret <16 x i8> %res +} + +; VERLLB with the largest count + 1. +define <16 x i8> @test_verllb_4096(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_4096: +; CHECK: lhi [[REG:%r[1-5]]], 4096 +; CHECK: verllb %v24, %v24, 0([[REG]]) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) + ret <16 x i8> %res +} + +; VERIMB. +define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_verimb: +; CHECK: verimb %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VERIMH. +define <8 x i16> @test_verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_verimh: +; CHECK: verimh %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, i32 1) + ret <8 x i16> %res +} + +; VERIMF. +define <4 x i32> @test_verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_verimf: +; CHECK: verimf %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1) + ret <4 x i32> %res +} + +; VERIMG. +define <2 x i64> @test_verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { +; CHECK-LABEL: test_verimg: +; CHECK: verimg %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, i32 1) + ret <2 x i64> %res +} + +; VERIMB with a different mask. 
+define <16 x i8> @test_verimb_254(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_verimb_254: +; CHECK: verimb %v24, %v26, %v28, 254 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 254) + ret <16 x i8> %res +} + +; VSL. +define <16 x i8> @test_vsl(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsl: +; CHECK: vsl %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsl(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSLB. +define <16 x i8> @test_vslb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vslb: +; CHECK: vslb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vslb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRA. +define <16 x i8> @test_vsra(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsra: +; CHECK: vsra %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsra(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRAB. +define <16 x i8> @test_vsrab(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrab: +; CHECK: vsrab %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrab(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRL. +define <16 x i8> @test_vsrl(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrl: +; CHECK: vsrl %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrl(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRLB. +define <16 x i8> @test_vsrlb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrlb: +; CHECK: vsrlb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSLDB with the minimum useful value. 
+define <16 x i8> @test_vsldb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsldb_1: +; CHECK: vsldb %v24, %v24, %v26, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VSLDB with the maximum value. +define <16 x i8> @test_vsldb_15(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsldb_15: +; CHECK: vsldb %v24, %v24, %v26, 15 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 15) + ret <16 x i8> %res +} + +; VSCBIB. +define <16 x i8> @test_vscbib(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vscbib: +; CHECK: vscbib %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vscbib(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSCBIH. +define <8 x i16> @test_vscbih(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vscbih: +; CHECK: vscbih %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vscbih(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VSCBIF. +define <4 x i32> @test_vscbif(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vscbif: +; CHECK: vscbif %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vscbif(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VSCBIG. +define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vscbig: +; CHECK: vscbig %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vscbig(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VSQ. +define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsq: +; CHECK: vsq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSBIQ. 
+define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vsbiq: +; CHECK: vsbiq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VSCBIQ. +define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vscbiq: +; CHECK: vscbiq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSBCBIQ. +define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vsbcbiq: +; CHECK: vsbcbiq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VSUMB. +define <4 x i32> @test_vsumb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsumb: +; CHECK: vsumb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vsumb(<16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %res +} + +; VSUMH. +define <4 x i32> @test_vsumh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vsumh: +; CHECK: vsumh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vsumh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VSUMGH. +define <2 x i64> @test_vsumgh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vsumgh: +; CHECK: vsumgh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %a, <8 x i16> %b) + ret <2 x i64> %res +} + +; VSUMGF. +define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vsumgf: +; CHECK: vsumgf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VSUMQF. 
+define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vsumqf: +; CHECK: vsumqf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) + ret <16 x i8> %res +} + +; VSUMQG. +define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vsumqg: +; CHECK: vsumqg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) + ret <16 x i8> %res +} + +; VTM with no processing of the result. +define i32 @test_vtm(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vtm: +; CHECK: vtm %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) + ret i32 %res +} + +; VTM, storing to %ptr if all bits are set. +define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vtm_all_store: +; CHECK-NOT: %r +; CHECK: vtm %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) + %cmp = icmp sge i32 %res, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret void +} + +; VCEQBS with no processing of the result. +define i32 @test_vceqbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vceqbs: +; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCEQBS, returning 1 if any elements are equal (CC != 3). 
+define i32 @test_vceqbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vceqbs_any_bool: +; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQBS, storing to %ptr if any elements are equal. +define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vceqbs_any_store: +; CHECK-NOT: %r +; CHECK: vceqbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCEQHS with no processing of the result. +define i32 @test_vceqhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vceqhs: +; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCEQHS, returning 1 if not all elements are equal. +define i32 @test_vceqhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vceqhs_notall_bool: +; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQHS, storing to %ptr if not all elements are equal. 
+define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vceqhs_notall_store: +; CHECK-NOT: %r +; CHECK: vceqhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCEQFS with no processing of the result. +define i32 @test_vceqfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vceqfs: +; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCEQFS, returning 1 if no elements are equal. +define i32 @test_vceqfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vceqfs_none_bool: +; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQFS, storing to %ptr if no elements are equal. 
+define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vceqfs_none_store: +; CHECK-NOT: %r +; CHECK: vceqfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCEQGS with no processing of the result. +define i32 @test_vceqgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vceqgs: +; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCEQGS returning 1 if all elements are equal (CC == 0). +define i32 @test_vceqgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vceqgs_all_bool: +; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ult i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQGS, storing to %ptr if all elements are equal. 
+define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vceqgs_all_store: +; CHECK-NOT: %r +; CHECK: vceqgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VCHBS with no processing of the result. +define i32 @test_vchbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchbs: +; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCHBS, returning 1 if any elements are higher (CC != 3). +define i32 @test_vchbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchbs_any_bool: +; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHBS, storing to %ptr if any elements are higher. 
+define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchbs_any_store: +; CHECK-NOT: %r +; CHECK: vchbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCHHS with no processing of the result. +define i32 @test_vchhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchhs: +; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCHHS, returning 1 if not all elements are higher. +define i32 @test_vchhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchhs_notall_bool: +; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHHS, storing to %ptr if not all elements are higher. 
+define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchhs_notall_store: +; CHECK-NOT: %r +; CHECK: vchhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCHFS with no processing of the result. +define i32 @test_vchfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchfs: +; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCHFS, returning 1 if no elements are higher. +define i32 @test_vchfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchfs_none_bool: +; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHFS, storing to %ptr if no elements are higher. 
+define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchfs_none_store: +; CHECK-NOT: %r +; CHECK: vchfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCHGS with no processing of the result. +define i32 @test_vchgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchgs: +; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCHGS returning 1 if all elements are higher (CC == 0). +define i32 @test_vchgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchgs_all_bool: +; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ult i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHGS, storing to %ptr if all elements are higher. 
+define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchgs_all_store: +; CHECK-NOT: %r +; CHECK: vchgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VCHLBS with no processing of the result. +define i32 @test_vchlbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchlbs: +; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCHLBS, returning 1 if any elements are higher (CC != 3). +define i32 @test_vchlbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchlbs_any_bool: +; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLBS, storing to %ptr if any elements are higher. 
+define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchlbs_any_store: +; CHECK-NOT: %r +; CHECK: vchlbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCHLHS with no processing of the result. +define i32 @test_vchlhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchlhs: +; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCHLHS, returning 1 if not all elements are higher. +define i32 @test_vchlhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchlhs_notall_bool: +; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp uge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLHS, storing to %ptr if not all elements are higher. 
+define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchlhs_notall_store: +; CHECK-NOT: %r +; CHECK: vchlhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sgt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCHLFS with no processing of the result. +define i32 @test_vchlfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchlfs: +; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCHLFS, returning 1 if no elements are higher. +define i32 @test_vchlfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchlfs_none_bool: +; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLFS, storing to %ptr if no elements are higher. 
+define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchlfs_none_store: +; CHECK-NOT: %r +; CHECK: vchlfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCHLGS with no processing of the result. +define i32 @test_vchlgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchlgs: +; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCHLGS returning 1 if all elements are higher (CC == 0). +define i32 @test_vchlgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchlgs_all_bool: +; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp slt i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLGS, storing to %ptr if all elements are higher. 
+define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchlgs_all_store: +; CHECK-NOT: %r +; CHECK: vchlgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFAEB with !IN !RT. +define <16 x i8> @test_vfaeb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_0: +; CHECK: vfaeb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 0) + ret <16 x i8> %res +} + +; VFAEB with !IN RT. +define <16 x i8> @test_vfaeb_4(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_4: +; CHECK: vfaeb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 4) + ret <16 x i8> %res +} + +; VFAEB with IN !RT. +define <16 x i8> @test_vfaeb_8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_8: +; CHECK: vfaeb %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 8) + ret <16 x i8> %res +} + +; VFAEB with IN RT. +define <16 x i8> @test_vfaeb_12(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_12: +; CHECK: vfaeb %v24, %v24, %v26, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 12) + ret <16 x i8> %res +} + +; VFAEB with CS -- should be ignored. +define <16 x i8> @test_vfaeb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_1: +; CHECK: vfaeb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VFAEH. 
+define <8 x i16> @test_vfaeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfaeh: +; CHECK: vfaeh %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfaeh(<8 x i16> %a, <8 x i16> %b, i32 4) + ret <8 x i16> %res +} + +; VFAEF. +define <4 x i32> @test_vfaef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfaef: +; CHECK: vfaef %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfaef(<4 x i32> %a, <4 x i32> %b, i32 8) + ret <4 x i32> %res +} + +; VFAEBS. +define <16 x i8> @test_vfaebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaebs: +; CHECK: vfaebs %v24, %v24, %v26, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8> %a, <16 x i8> %b, + i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFAEHS. +define <8 x i16> @test_vfaehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaehs: +; CHECK: vfaehs %v24, %v24, %v26, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16> %a, <8 x i16> %b, + i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFAEFS. +define <4 x i32> @test_vfaefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaefs: +; CHECK: vfaefs %v24, %v24, %v26, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32> %a, <4 x i32> %b, + i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFAEZB with !IN !RT. 
+define <16 x i8> @test_vfaezb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_0: +; CHECK: vfaezb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 0) + ret <16 x i8> %res +} + +; VFAEZB with !IN RT. +define <16 x i8> @test_vfaezb_4(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_4: +; CHECK: vfaezb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 4) + ret <16 x i8> %res +} + +; VFAEZB with IN !RT. +define <16 x i8> @test_vfaezb_8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_8: +; CHECK: vfaezb %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 8) + ret <16 x i8> %res +} + +; VFAEZB with IN RT. +define <16 x i8> @test_vfaezb_12(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_12: +; CHECK: vfaezb %v24, %v24, %v26, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 12) + ret <16 x i8> %res +} + +; VFAEZB with CS -- should be ignored. +define <16 x i8> @test_vfaezb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_1: +; CHECK: vfaezb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VFAEZH. +define <8 x i16> @test_vfaezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfaezh: +; CHECK: vfaezh %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfaezh(<8 x i16> %a, <8 x i16> %b, i32 4) + ret <8 x i16> %res +} + +; VFAEZF. +define <4 x i32> @test_vfaezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfaezf: +; CHECK: vfaezf %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfaezf(<4 x i32> %a, <4 x i32> %b, i32 8) + ret <4 x i32> %res +} + +; VFAEZBS. 
+define <16 x i8> @test_vfaezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezbs: +; CHECK: vfaezbs %v24, %v24, %v26, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8> %a, <16 x i8> %b, + i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFAEZHS. +define <8 x i16> @test_vfaezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezhs: +; CHECK: vfaezhs %v24, %v24, %v26, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16> %a, <8 x i16> %b, + i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFAEZFS. +define <4 x i32> @test_vfaezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezfs: +; CHECK: vfaezfs %v24, %v24, %v26, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32> %a, <4 x i32> %b, + i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFEEB. +define <16 x i8> @test_vfeeb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeeb_0: +; CHECK: vfeeb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeeb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFEEH. +define <8 x i16> @test_vfeeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeeh: +; CHECK: vfeeh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeeh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFEEF. 
+define <4 x i32> @test_vfeef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfeef: +; CHECK: vfeef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfeef(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFEEBS. +define <16 x i8> @test_vfeebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeebs: +; CHECK: vfeebs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFEEHS. +define <8 x i16> @test_vfeehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeehs: +; CHECK: vfeehs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFEEFS. +define <4 x i32> @test_vfeefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeefs: +; CHECK: vfeefs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFEEZB. +define <16 x i8> @test_vfeezb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeezb: +; CHECK: vfeezb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeezb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFEEZH. 
+define <8 x i16> @test_vfeezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeezh: +; CHECK: vfeezh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeezh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFEEZF. +define <4 x i32> @test_vfeezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfeezf: +; CHECK: vfeezf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfeezf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFEEZBS. +define <16 x i8> @test_vfeezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezbs: +; CHECK: vfeezbs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFEEZHS. +define <8 x i16> @test_vfeezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezhs: +; CHECK: vfeezhs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFEEZFS. +define <4 x i32> @test_vfeezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezfs: +; CHECK: vfeezfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFENEB. 
+define <16 x i8> @test_vfeneb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeneb_0: +; CHECK: vfeneb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeneb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFENEH. +define <8 x i16> @test_vfeneh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeneh: +; CHECK: vfeneh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeneh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFENEF. +define <4 x i32> @test_vfenef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfenef: +; CHECK: vfenef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfenef(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFENEBS. +define <16 x i8> @test_vfenebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenebs: +; CHECK: vfenebs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFENEHS. +define <8 x i16> @test_vfenehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenehs: +; CHECK: vfenehs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFENEFS. 
+define <4 x i32> @test_vfenefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenefs: +; CHECK: vfenefs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFENEZB. +define <16 x i8> @test_vfenezb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfenezb: +; CHECK: vfenezb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfenezb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFENEZH. +define <8 x i16> @test_vfenezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfenezh: +; CHECK: vfenezh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfenezh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFENEZF. +define <4 x i32> @test_vfenezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfenezf: +; CHECK: vfenezf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfenezf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFENEZBS. +define <16 x i8> @test_vfenezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezbs: +; CHECK: vfenezbs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFENEZHS. 
+define <8 x i16> @test_vfenezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezhs: +; CHECK: vfenezhs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFENEZFS. +define <4 x i32> @test_vfenezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezfs: +; CHECK: vfenezfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VISTRB. +define <16 x i8> @test_vistrb(<16 x i8> %a) { +; CHECK-LABEL: test_vistrb: +; CHECK: vistrb %v24, %v24 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vistrb(<16 x i8> %a) + ret <16 x i8> %res +} + +; VISTRH. +define <8 x i16> @test_vistrh(<8 x i16> %a) { +; CHECK-LABEL: test_vistrh: +; CHECK: vistrh %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vistrh(<8 x i16> %a) + ret <8 x i16> %res +} + +; VISTRF. +define <4 x i32> @test_vistrf(<4 x i32> %a) { +; CHECK-LABEL: test_vistrf: +; CHECK: vistrf %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vistrf(<4 x i32> %a) + ret <4 x i32> %res +} + +; VISTRBS. 
+define <16 x i8> @test_vistrbs(<16 x i8> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrbs: +; CHECK: vistrbs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8> %a) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VISTRHS. +define <8 x i16> @test_vistrhs(<8 x i16> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrhs: +; CHECK: vistrhs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16> %a) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VISTRFS. +define <4 x i32> @test_vistrfs(<4 x i32> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrfs: +; CHECK: vistrfs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32> %a) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VSTRCB with !IN !RT. +define <16 x i8> @test_vstrcb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_0: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VSTRCB with !IN RT. +define <16 x i8> @test_vstrcb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_4: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 4) + ret <16 x i8> %res +} + +; VSTRCB with IN !RT. 
+define <16 x i8> @test_vstrcb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_8: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 8) + ret <16 x i8> %res +} + +; VSTRCB with IN RT. +define <16 x i8> @test_vstrcb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_12: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VSTRCB with CS -- should be ignored. +define <16 x i8> @test_vstrcb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_1: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VSTRCH. +define <8 x i16> @test_vstrch(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vstrch: +; CHECK: vstrch %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vstrch(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + ret <8 x i16> %res +} + +; VSTRCF. +define <4 x i32> @test_vstrcf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vstrcf: +; CHECK: vstrcf %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vstrcf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + ret <4 x i32> %res +} + +; VSTRCBS. 
+define <16 x i8> @test_vstrcbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrcbs: +; CHECK: vstrcbs %v24, %v24, %v26, %v28, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VSTRCHS. +define <8 x i16> @test_vstrchs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrchs: +; CHECK: vstrchs %v24, %v24, %v26, %v28, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VSTRCFS. +define <4 x i32> @test_vstrcfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrcfs: +; CHECK: vstrcfs %v24, %v24, %v26, %v28, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VSTRCZB with !IN !RT. +define <16 x i8> @test_vstrczb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_0: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VSTRCZB with !IN RT. 
+define <16 x i8> @test_vstrczb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_4: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 4) + ret <16 x i8> %res +} + +; VSTRCZB with IN !RT. +define <16 x i8> @test_vstrczb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_8: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 8) + ret <16 x i8> %res +} + +; VSTRCZB with IN RT. +define <16 x i8> @test_vstrczb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_12: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VSTRCZB with CS -- should be ignored. +define <16 x i8> @test_vstrczb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_1: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VSTRCZH. +define <8 x i16> @test_vstrczh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vstrczh: +; CHECK: vstrczh %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vstrczh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + ret <8 x i16> %res +} + +; VSTRCZF. +define <4 x i32> @test_vstrczf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vstrczf: +; CHECK: vstrczf %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vstrczf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + ret <4 x i32> %res +} + +; VSTRCZBS. 
+define <16 x i8> @test_vstrczbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczbs: +; CHECK: vstrczbs %v24, %v24, %v26, %v28, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VSTRCZHS. +define <8 x i16> @test_vstrczhs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczhs: +; CHECK: vstrczhs %v24, %v24, %v26, %v28, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VSTRCZFS. +define <4 x i32> @test_vstrczfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczfs: +; CHECK: vstrczfs %v24, %v24, %v26, %v28, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFCEDBS with no processing of the result. 
+define i32 @test_vfcedbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfcedbs: +; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCEDBS, returning 1 if any elements are equal (CC != 3). +define i32 @test_vfcedbs_any_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfcedbs_any_bool: +; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCEDBS, storing to %ptr if any elements are equal. +define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfcedbs_any_store: +; CHECK-NOT: %r +; CHECK: vfcedbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFCHDBS with no processing of the result. +define i32 @test_vfchdbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchdbs: +; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCHDBS, returning 1 if not all elements are higher. 
+define i32 @test_vfchdbs_notall_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchdbs_notall_bool: +; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHDBS, storing to %ptr if not all elements are higher. +define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchdbs_notall_store: +; CHECK-NOT: %r +; CHECK: vfchdbs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFCHEDBS with no processing of the result. +define i32 @test_vfchedbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchedbs: +; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCHEDBS, returning 1 if neither element is higher or equal. 
+define i32 @test_vfchedbs_none_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchedbs_none_bool: +; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHEDBS, storing to %ptr if neither element is higher or equal. +define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchedbs_none_store: +; CHECK-NOT: %r +; CHECK: vfchedbs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFTCIDB with the lowest useful class selector and no processing of the result. +define i32 @test_vftcidb(<2 x double> %a) { +; CHECK-LABEL: test_vftcidb: +; CHECK: vftcidb {{%v[0-9]+}}, %v24, 1 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 1) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFTCIDB with the highest useful class selector, returning 1 if all elements +; have the right class (CC == 0). 
+define i32 @test_vftcidb_all_bool(<2 x double> %a) { +; CHECK-LABEL: test_vftcidb_all_bool: +; CHECK: vftcidb {{%v[0-9]+}}, %v24, 4094 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 4094) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp eq i32 %res, 0 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFIDB with a rounding mode not usable via standard intrinsics. +define <2 x double> @test_vfidb_0_4(<2 x double> %a) { +; CHECK-LABEL: test_vfidb_0_4: +; CHECK: vfidb %v24, %v24, 0, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 0, i32 4) + ret <2 x double> %res +} + +; VFIDB with IEEE-inexact exception suppressed. +define <2 x double> @test_vfidb_4_0(<2 x double> %a) { +; CHECK-LABEL: test_vfidb_4_0: +; CHECK: vfidb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 4, i32 0) + ret <2 x double> %res +} + diff --git a/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/test/CodeGen/SystemZ/vec-intrinsics-02.ll new file mode 100644 index 000000000000..84c6a0784031 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-intrinsics-02.ll @@ -0,0 +1,441 @@ +; Test vector intrinsics added with z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <2 x i64> @llvm.s390.vbperm(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vmslg(<2 x i64>, <2 x i64>, <16 x i8>, i32) +declare <16 x i8> @llvm.s390.vlrl(i32, i8 *) +declare void @llvm.s390.vstrl(<16 x i8>, i32, i8 *) + +declare {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float>, i32) +declare <4 x float> @llvm.s390.vfisb(<4 x float>, i32, i32) + +declare <2 x double> @llvm.s390.vfmaxdb(<2 x double>, <2 x double>, i32) +declare <2 x double> @llvm.s390.vfmindb(<2 x double>, <2 x double>, i32) +declare <4 x float> @llvm.s390.vfmaxsb(<4 x float>, <4 x float>, i32) +declare <4 x float> @llvm.s390.vfminsb(<4 x float>, <4 x float>, i32) + +; VBPERM. +define <2 x i64> @test_vbperm(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vbperm: +; CHECK: vbperm %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vbperm(<16 x i8> %a, <16 x i8> %b) + ret <2 x i64> %res +} + +; VMSLG with no shifts. +define <16 x i8> @test_vmslg1(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmslg1: +; CHECK: vmslg %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VMSLG with both shifts. +define <16 x i8> @test_vmslg2(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmslg2: +; CHECK: vmslg %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VLRLR with the lowest in-range displacement. 
+define <16 x i8> @test_vlrlr1(i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vlrlr1: +; CHECK: vlrlr %v24, %r3, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRLR with the highest in-range displacement. +define <16 x i8> @test_vlrlr2(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vlrlr2: +; CHECK: vlrlr %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRLR with an out-of-range displacement. +define <16 x i8> @test_vlrlr3(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vlrlr3: +; CHECK: vlrlr %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLRLR doesn't allow an index. +define <16 x i8> @test_vlrlr4(i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vlrlr4: +; CHECK: vlrlr %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with the lowest in-range displacement. +define <16 x i8> @test_vlrl1(i8 *%ptr) { +; CHECK-LABEL: test_vlrl1: +; CHECK: vlrl %v24, 0(%r2), 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with the highest in-range displacement. +define <16 x i8> @test_vlrl2(i8 *%base) { +; CHECK-LABEL: test_vlrl2: +; CHECK: vlrl %v24, 4095(%r2), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with an out-of-range displacement. 
+define <16 x i8> @test_vlrl3(i8 *%base) { +; CHECK-LABEL: test_vlrl3: +; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLRL doesn't allow an index. +define <16 x i8> @test_vlrl4(i8 *%base, i64 %index) { +; CHECK-LABEL: test_vlrl4: +; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VSTRLR with the lowest in-range displacement. +define void @test_vstrlr1(<16 x i8> %vec, i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vstrlr1: +; CHECK: vstrlr %v24, %r3, 0(%r2) +; CHECK: br %r14 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRLR with the highest in-range displacement. +define void @test_vstrlr2(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstrlr2: +; CHECK: vstrlr %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRLR with an out-of-range displacement. +define void @test_vstrlr3(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstrlr3: +; CHECK: vstrlr %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; Check that VSTRLR doesn't allow an index. +define void @test_vstrlr4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vstrlr4: +; CHECK: vstrlr %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRL with the lowest in-range displacement. 
+define void @test_vstrl1(<16 x i8> %vec, i8 *%ptr) { +; CHECK-LABEL: test_vstrl1: +; CHECK: vstrl %v24, 0(%r2), 8 +; CHECK: br %r14 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VSTRL with the highest in-range displacement. +define void @test_vstrl2(<16 x i8> %vec, i8 *%base) { +; CHECK-LABEL: test_vstrl2: +; CHECK: vstrl %v24, 4095(%r2), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VSTRL with an out-of-range displacement. +define void @test_vstrl3(<16 x i8> %vec, i8 *%base) { +; CHECK-LABEL: test_vstrl3: +; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; Check that VSTRL doesn't allow an index. +define void @test_vstrl4(<16 x i8> %vec, i8 *%base, i64 %index) { +; CHECK-LABEL: test_vstrl4: +; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VFCESBS with no processing of the result. +define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfcesbs: +; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCESBS, returning 1 if any elements are equal (CC != 3). 
+define i32 @test_vfcesbs_any_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfcesbs_any_bool: +; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCESBS, storing to %ptr if any elements are equal. +define <4 x i32> @test_vfcesbs_any_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfcesbs_any_store: +; CHECK-NOT: %r +; CHECK: vfcesbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFCHSBS with no processing of the result. +define i32 @test_vfchsbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchsbs: +; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCHSBS, returning 1 if not all elements are higher. 
+define i32 @test_vfchsbs_notall_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchsbs_notall_bool: +; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHSBS, storing to %ptr if not all elements are higher. +define <4 x i32> @test_vfchsbs_notall_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchsbs_notall_store: +; CHECK-NOT: %r +; CHECK: vfchsbs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFCHESBS with no processing of the result. +define i32 @test_vfchesbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchesbs: +; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCHESBS, returning 1 if neither element is higher or equal. 
+define i32 @test_vfchesbs_none_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchesbs_none_bool: +; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHESBS, storing to %ptr if neither element is higher or equal. +define <4 x i32> @test_vfchesbs_none_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchesbs_none_store: +; CHECK-NOT: %r +; CHECK: vfchesbs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFTCISB with the lowest useful class selector and no processing of the result. +define i32 @test_vftcisb(<4 x float> %a) { +; CHECK-LABEL: test_vftcisb: +; CHECK: vftcisb {{%v[0-9]+}}, %v24, 1 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 1) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFTCISB with the highest useful class selector, returning 1 if all elements +; have the right class (CC == 0). 
+define i32 @test_vftcisb_all_bool(<4 x float> %a) { +; CHECK-LABEL: test_vftcisb_all_bool: +; CHECK: vftcisb {{%v[0-9]+}}, %v24, 4094 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 4094) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 0 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFISB with a rounding mode not usable via standard intrinsics. +define <4 x float> @test_vfisb_0_4(<4 x float> %a) { +; CHECK-LABEL: test_vfisb_0_4: +; CHECK: vfisb %v24, %v24, 0, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 0, i32 4) + ret <4 x float> %res +} + +; VFISB with IEEE-inexact exception suppressed. +define <4 x float> @test_vfisb_4_0(<4 x float> %a) { +; CHECK-LABEL: test_vfisb_4_0: +; CHECK: vfisb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 4, i32 0) + ret <4 x float> %res +} + +; VFMAXDB. +define <2 x double> @test_vfmaxdb(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfmaxdb: +; CHECK: vfmaxdb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %a, <2 x double> %b, i32 4) + ret <2 x double> %res +} + +; VFMINDB. +define <2 x double> @test_vfmindb(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfmindb: +; CHECK: vfmindb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfmindb(<2 x double> %a, <2 x double> %b, i32 4) + ret <2 x double> %res +} + +; VFMAXSB. +define <4 x float> @test_vfmaxsb(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfmaxsb: +; CHECK: vfmaxsb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %a, <4 x float> %b, i32 4) + ret <4 x float> %res +} + +; VFMINSB. 
+define <4 x float> @test_vfminsb(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfminsb: +; CHECK: vfminsb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfminsb(<4 x float> %a, <4 x float> %b, i32 4) + ret <4 x float> %res +} + diff --git a/test/CodeGen/SystemZ/vec-intrinsics.ll b/test/CodeGen/SystemZ/vec-intrinsics.ll deleted file mode 100644 index 6f5eb0691aa8..000000000000 --- a/test/CodeGen/SystemZ/vec-intrinsics.ll +++ /dev/null @@ -1,3335 +0,0 @@ -; Test vector intrinsics. -; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s - -declare i32 @llvm.s390.lcbb(i8 *, i32) -declare <16 x i8> @llvm.s390.vlbb(i8 *, i32) -declare <16 x i8> @llvm.s390.vll(i32, i8 *) -declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32) -declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>) -declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>) -declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>) -declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>) -declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>) -declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *) -declare <8 x i16> @llvm.s390.vuphb(<16 x i8>) -declare <4 x i32> @llvm.s390.vuphh(<8 x i16>) -declare <2 x i64> @llvm.s390.vuphf(<4 x i32>) -declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>) -declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>) -declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>) -declare <8 x i16> @llvm.s390.vuplb(<16 x i8>) -declare 
<4 x i32> @llvm.s390.vuplhw(<8 x i16>) -declare <2 x i64> @llvm.s390.vuplf(<4 x i32>) -declare <8 x i16> @llvm.s390.vupllb(<16 x i8>) -declare <4 x i32> @llvm.s390.vupllh(<8 x i16>) -declare <2 x i64> @llvm.s390.vupllf(<4 x i32>) -declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vavgf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vavgg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vavglb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vavglh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vavglf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vavglg(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) -declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x i64>) -declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) -declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vmalhb(<16 
x i8>, <16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmalhh(<8 x i16>, <8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmalhf(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmaeb(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaeh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmaef(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmaleb(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaleh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmalef(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmaob(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaoh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmaof(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmalob(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaloh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmalof(<4 x i32>, <4 x i32>, <2 x i64>) -declare <16 x i8> @llvm.s390.vmhb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmhh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmhf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vmlhb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmlhh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmlhf(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmeb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmeh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmef(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmleb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmleh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmlef(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmob(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmoh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) -declare <2 x i64> 
@llvm.s390.vmlof(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) -declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32) -declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) -declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) -declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare <2 x i64> @llvm.s390.verimg(<2 x i64>, <2 x i64>, <2 x i64>, i32) -declare <16 x i8> @llvm.s390.vsl(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vslb(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsra(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrab(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrl(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrlb(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32) -declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) -declare <16 x i8> 
@llvm.s390.vsumqg(<2 x i64>, <2 x i64>) -declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) -declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vfaeb(<16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vfaeh(<8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vfaef(<4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8>, <16 x i8>, i32) -declare {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16>, <8 x i16>, i32) -declare {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32>, <4 x i32>, i32) -declare <16 x i8> @llvm.s390.vfaezb(<16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vfaezh(<8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vfaezf(<4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8>, <16 x i8>, i32) -declare {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16>, <8 x i16>, i32) -declare {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32>, <4 x i32>, i32) -declare <16 x i8> @llvm.s390.vfeeb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeeh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfeef(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16>, <8 x i16>) 
-declare {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfeezb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeezh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfeezf(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfeneb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeneh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfenef(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfenezb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfenezh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfenezf(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vistrb(<16 x i8>) -declare <8 x i16> @llvm.s390.vistrh(<8 x i16>) -declare <4 x i32> @llvm.s390.vistrf(<4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32>) -declare <16 x i8> @llvm.s390.vstrcb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vstrch(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vstrcf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8>, <16 x i8>, <16 x i8>, - i32) -declare {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16>, <8 x i16>, <8 x i16>, - i32) -declare {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32>, <4 x i32>, <4 x i32>, - i32) 
-declare <16 x i8> @llvm.s390.vstrczb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vstrczh(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vstrczf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8>, <16 x i8>, <16 x i8>, - i32) -declare {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16>, <8 x i16>, <8 x i16>, - i32) -declare {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32>, <4 x i32>, <4 x i32>, - i32) -declare {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double>, i32) -declare <2 x double> @llvm.s390.vfidb(<2 x double>, i32, i32) - -; LCBB with the lowest M3 operand. -define i32 @test_lcbb1(i8 *%ptr) { -; CHECK-LABEL: test_lcbb1: -; CHECK: lcbb %r2, 0(%r2), 0 -; CHECK: br %r14 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 0) - ret i32 %res -} - -; LCBB with the highest M3 operand. -define i32 @test_lcbb2(i8 *%ptr) { -; CHECK-LABEL: test_lcbb2: -; CHECK: lcbb %r2, 0(%r2), 15 -; CHECK: br %r14 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 15) - ret i32 %res -} - -; LCBB with a displacement and index. -define i32 @test_lcbb3(i8 *%base, i64 %index) { -; CHECK-LABEL: test_lcbb3: -; CHECK: lcbb %r2, 4095({{%r2,%r3|%r3,%r2}}), 4 -; CHECK: br %r14 - %add = add i64 %index, 4095 - %ptr = getelementptr i8, i8 *%base, i64 %add - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 4) - ret i32 %res -} - -; LCBB with an out-of-range displacement. -define i32 @test_lcbb4(i8 *%base) { -; CHECK-LABEL: test_lcbb4: -; CHECK: lcbb %r2, 0({{%r[1-5]}}), 5 -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 5) - ret i32 %res -} - -; VLBB with the lowest M3 operand. 
-define <16 x i8> @test_vlbb1(i8 *%ptr) { -; CHECK-LABEL: test_vlbb1: -; CHECK: vlbb %v24, 0(%r2), 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 0) - ret <16 x i8> %res -} - -; VLBB with the highest M3 operand. -define <16 x i8> @test_vlbb2(i8 *%ptr) { -; CHECK-LABEL: test_vlbb2: -; CHECK: vlbb %v24, 0(%r2), 15 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 15) - ret <16 x i8> %res -} - -; VLBB with a displacement and index. -define <16 x i8> @test_vlbb3(i8 *%base, i64 %index) { -; CHECK-LABEL: test_vlbb3: -; CHECK: vlbb %v24, 4095({{%r2,%r3|%r3,%r2}}), 4 -; CHECK: br %r14 - %add = add i64 %index, 4095 - %ptr = getelementptr i8, i8 *%base, i64 %add - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 4) - ret <16 x i8> %res -} - -; VLBB with an out-of-range displacement. -define <16 x i8> @test_vlbb4(i8 *%base) { -; CHECK-LABEL: test_vlbb4: -; CHECK: vlbb %v24, 0({{%r[1-5]}}), 5 -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 5) - ret <16 x i8> %res -} - -; VLL with the lowest in-range displacement. -define <16 x i8> @test_vll1(i8 *%ptr, i32 %length) { -; CHECK-LABEL: test_vll1: -; CHECK: vll %v24, %r3, 0(%r2) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VLL with the highest in-range displacement. -define <16 x i8> @test_vll2(i8 *%base, i32 %length) { -; CHECK-LABEL: test_vll2: -; CHECK: vll %v24, %r3, 4095(%r2) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4095 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VLL with an out-of-range displacementa. 
-define <16 x i8> @test_vll3(i8 *%base, i32 %length) { -; CHECK-LABEL: test_vll3: -; CHECK: vll %v24, %r3, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; Check that VLL doesn't allow an index. -define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) { -; CHECK-LABEL: test_vll4: -; CHECK: vll %v24, %r4, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 %index - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VPDI taking element 0 from each half. -define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpdi1: -; CHECK: vpdi %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 0) - ret <2 x i64> %res -} - -; VPDI taking element 1 from each half. -define <2 x i64> @test_vpdi2(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpdi2: -; CHECK: vpdi %v24, %v24, %v26, 10 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 10) - ret <2 x i64> %res -} - -; VPERM. -define <16 x i8> @test_vperm(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vperm: -; CHECK: vperm %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vperm(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VPKSH. -define <16 x i8> @test_vpksh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vpksh: -; CHECK: vpksh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vpksh(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %res -} - -; VPKSF. -define <8 x i16> @test_vpksf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vpksf: -; CHECK: vpksf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vpksf(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %res -} - -; VPKSG. 
-define <4 x i32> @test_vpksg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpksg: -; CHECK: vpksg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vpksg(<2 x i64> %a, <2 x i64> %b) - ret <4 x i32> %res -} - -; VPKSHS with no processing of the result. -define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpkshs: -; CHECK: vpkshs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VPKSHS, storing to %ptr if all values were saturated. -define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) { -; CHECK-LABEL: test_vpkshs_all_store: -; CHECK: vpkshs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VPKSFS with no processing of the result. -define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpksfs: -; CHECK: vpksfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VPKSFS, storing to %ptr if any values were saturated. 
-define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { -; CHECK-LABEL: test_vpksfs_any_store: -; CHECK: vpksfs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VPKSGS with no processing of the result. -define <4 x i32> @test_vpksgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpksgs: -; CHECK: vpksgs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VPKSGS, storing to %ptr if no elements were saturated -define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpksgs_none_store: -; CHECK: vpksgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VPKLSH. -define <16 x i8> @test_vpklsh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vpklsh: -; CHECK: vpklsh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vpklsh(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %res -} - -; VPKLSF. 
-define <8 x i16> @test_vpklsf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vpklsf: -; CHECK: vpklsf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %res -} - -; VPKLSG. -define <4 x i32> @test_vpklsg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpklsg: -; CHECK: vpklsg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> %a, <2 x i64> %b) - ret <4 x i32> %res -} - -; VPKLSHS with no processing of the result. -define <16 x i8> @test_vpklshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklshs: -; CHECK: vpklshs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VPKLSHS, storing to %ptr if all values were saturated. -define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklshs_all_store: -; CHECK: vpklshs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp eq i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VPKLSFS with no processing of the result. 
-define <8 x i16> @test_vpklsfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklsfs: -; CHECK: vpklsfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VPKLSFS, storing to %ptr if any values were saturated. -define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklsfs_any_store: -; CHECK: vpklsfs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ne i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VPKLSGS with no processing of the result. 
-define <4 x i32> @test_vpklsgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklsgs: -; CHECK: vpklsgs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VPKLSGS, storing to %ptr if no elements were saturated -define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklsgs_none_store: -; CHECK: vpklsgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VSTL with the lowest in-range displacement. -define void @test_vstl1(<16 x i8> %vec, i8 *%ptr, i32 %length) { -; CHECK-LABEL: test_vstl1: -; CHECK: vstl %v24, %r3, 0(%r2) -; CHECK: br %r14 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VSTL with the highest in-range displacement. -define void @test_vstl2(<16 x i8> %vec, i8 *%base, i32 %length) { -; CHECK-LABEL: test_vstl2: -; CHECK: vstl %v24, %r3, 4095(%r2) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4095 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VSTL with an out-of-range displacement. 
-define void @test_vstl3(<16 x i8> %vec, i8 *%base, i32 %length) { -; CHECK-LABEL: test_vstl3: -; CHECK: vstl %v24, %r3, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; Check that VSTL doesn't allow an index. -define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { -; CHECK-LABEL: test_vstl4: -; CHECK: vstl %v24, %r4, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 %index - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VUPHB. -define <8 x i16> @test_vuphb(<16 x i8> %a) { -; CHECK-LABEL: test_vuphb: -; CHECK: vuphb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuphb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPHH. -define <4 x i32> @test_vuphh(<8 x i16> %a) { -; CHECK-LABEL: test_vuphh: -; CHECK: vuphh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuphh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPHF. -define <2 x i64> @test_vuphf(<4 x i32> %a) { -; CHECK-LABEL: test_vuphf: -; CHECK: vuphf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuphf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLHB. -define <8 x i16> @test_vuplhb(<16 x i8> %a) { -; CHECK-LABEL: test_vuplhb: -; CHECK: vuplhb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuplhb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLHH. -define <4 x i32> @test_vuplhh(<8 x i16> %a) { -; CHECK-LABEL: test_vuplhh: -; CHECK: vuplhh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuplhh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLHF. -define <2 x i64> @test_vuplhf(<4 x i32> %a) { -; CHECK-LABEL: test_vuplhf: -; CHECK: vuplhf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLB. 
-define <8 x i16> @test_vuplb(<16 x i8> %a) { -; CHECK-LABEL: test_vuplb: -; CHECK: vuplb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuplb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLHW. -define <4 x i32> @test_vuplhw(<8 x i16> %a) { -; CHECK-LABEL: test_vuplhw: -; CHECK: vuplhw %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuplhw(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLF. -define <2 x i64> @test_vuplf(<4 x i32> %a) { -; CHECK-LABEL: test_vuplf: -; CHECK: vuplf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuplf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLLB. -define <8 x i16> @test_vupllb(<16 x i8> %a) { -; CHECK-LABEL: test_vupllb: -; CHECK: vupllb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vupllb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLLH. -define <4 x i32> @test_vupllh(<8 x i16> %a) { -; CHECK-LABEL: test_vupllh: -; CHECK: vupllh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vupllh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLLF. -define <2 x i64> @test_vupllf(<4 x i32> %a) { -; CHECK-LABEL: test_vupllf: -; CHECK: vupllf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vupllf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VACCB. -define <16 x i8> @test_vaccb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaccb: -; CHECK: vaccb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaccb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACCH. -define <8 x i16> @test_vacch(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vacch: -; CHECK: vacch %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vacch(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VACCF. 
-define <4 x i32> @test_vaccf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaccf: -; CHECK: vaccf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vaccf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VACCG. -define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaccg: -; CHECK: vaccg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vaccg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VAQ. -define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaq: -; CHECK: vaq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACQ. -define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vacq: -; CHECK: vacq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VACCQ. -define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaccq: -; CHECK: vaccq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACCCQ. -define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vacccq: -; CHECK: vacccq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VAVGB. -define <16 x i8> @test_vavgb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vavgb: -; CHECK: vavgb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vavgb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VAVGH. 
-define <8 x i16> @test_vavgh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vavgh: -; CHECK: vavgh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vavgh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VAVGF. -define <4 x i32> @test_vavgf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vavgf: -; CHECK: vavgf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vavgf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VAVGG. -define <2 x i64> @test_vavgg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vavgg: -; CHECK: vavgg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vavgg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VAVGLB. -define <16 x i8> @test_vavglb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vavglb: -; CHECK: vavglb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vavglb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VAVGLH. -define <8 x i16> @test_vavglh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vavglh: -; CHECK: vavglh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vavglh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VAVGLF. -define <4 x i32> @test_vavglf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vavglf: -; CHECK: vavglf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vavglf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VAVGLG. -define <2 x i64> @test_vavglg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vavglg: -; CHECK: vavglg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vavglg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VCKSM. -define <4 x i32> @test_vcksm(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vcksm: -; CHECK: vcksm %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vcksm(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VGFMB. 
-define <8 x i16> @test_vgfmb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vgfmb: -; CHECK: vgfmb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VGFMH. -define <4 x i32> @test_vgfmh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vgfmh: -; CHECK: vgfmh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VGFMF. -define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vgfmf: -; CHECK: vgfmf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VGFMG. -define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vgfmg: -; CHECK: vgfmg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res -} - -; VGFMAB. -define <8 x i16> @test_vgfmab(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vgfmab: -; CHECK: vgfmab %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VGFMAH. -define <4 x i32> @test_vgfmah(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vgfmah: -; CHECK: vgfmah %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VGFMAF. -define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vgfmaf: -; CHECK: vgfmaf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VGFMAG. 
-define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vgfmag: -; CHECK: vgfmag %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMAHB. -define <16 x i8> @test_vmahb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vmahb: -; CHECK: vmahb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmahb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMAHH. -define <8 x i16> @test_vmahh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmahh: -; CHECK: vmahh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmahh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAHF. -define <4 x i32> @test_vmahf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmahf: -; CHECK: vmahf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmahf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALHB. -define <16 x i8> @test_vmalhb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vmalhb: -; CHECK: vmalhb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmalhb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMALHH. -define <8 x i16> @test_vmalhh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmalhh: -; CHECK: vmalhh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmalhh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALHF. -define <4 x i32> @test_vmalhf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmalhf: -; CHECK: vmalhf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmalhf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAEB. 
-define <8 x i16> @test_vmaeb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaeb: -; CHECK: vmaeb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaeb(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAEH. -define <4 x i32> @test_vmaeh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaeh: -; CHECK: vmaeh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaeh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAEF. -define <2 x i64> @test_vmaef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmaef: -; CHECK: vmaef %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmaef(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMALEB. -define <8 x i16> @test_vmaleb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaleb: -; CHECK: vmaleb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaleb(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALEH. -define <4 x i32> @test_vmaleh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaleh: -; CHECK: vmaleh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaleh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALEF. -define <2 x i64> @test_vmalef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmalef: -; CHECK: vmalef %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmalef(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMAOB. -define <8 x i16> @test_vmaob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaob: -; CHECK: vmaob %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaob(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAOH. 
-define <4 x i32> @test_vmaoh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaoh: -; CHECK: vmaoh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaoh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAOF. -define <2 x i64> @test_vmaof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmaof: -; CHECK: vmaof %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmaof(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMALOB. -define <8 x i16> @test_vmalob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmalob: -; CHECK: vmalob %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmalob(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALOH. -define <4 x i32> @test_vmaloh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaloh: -; CHECK: vmaloh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaloh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALOF. -define <2 x i64> @test_vmalof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmalof: -; CHECK: vmalof %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmalof(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMHB. -define <16 x i8> @test_vmhb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmhb: -; CHECK: vmhb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmhb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VMHH. -define <8 x i16> @test_vmhh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmhh: -; CHECK: vmhh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmhh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VMHF. 
-define <4 x i32> @test_vmhf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmhf: -; CHECK: vmhf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmhf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VMLHB. -define <16 x i8> @test_vmlhb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmlhb: -; CHECK: vmlhb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmlhb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VMLHH. -define <8 x i16> @test_vmlhh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmlhh: -; CHECK: vmlhh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmlhh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VMLHF. -define <4 x i32> @test_vmlhf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlhf: -; CHECK: vmlhf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmlhf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VMEB. -define <8 x i16> @test_vmeb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmeb: -; CHECK: vmeb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmeb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMEH. -define <4 x i32> @test_vmeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmeh: -; CHECK: vmeh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmeh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMEF. -define <2 x i64> @test_vmef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmef: -; CHECK: vmef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmef(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMLEB. -define <8 x i16> @test_vmleb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmleb: -; CHECK: vmleb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmleb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMLEH. 
-define <4 x i32> @test_vmleh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmleh: -; CHECK: vmleh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmleh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMLEF. -define <2 x i64> @test_vmlef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlef: -; CHECK: vmlef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmlef(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMOB. -define <8 x i16> @test_vmob(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmob: -; CHECK: vmob %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmob(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMOH. -define <4 x i32> @test_vmoh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmoh: -; CHECK: vmoh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmoh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMOF. -define <2 x i64> @test_vmof(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmof: -; CHECK: vmof %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmof(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMLOB. -define <8 x i16> @test_vmlob(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmlob: -; CHECK: vmlob %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmlob(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMLOH. -define <4 x i32> @test_vmloh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmloh: -; CHECK: vmloh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmloh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMLOF. -define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlof: -; CHECK: vmlof %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmlof(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VERLLVB. 
-define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_verllvb: -; CHECK: verllvb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VERLLVH. -define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_verllvh: -; CHECK: verllvh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VERLLVF. -define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_verllvf: -; CHECK: verllvf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VERLLVG. -define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_verllvg: -; CHECK: verllvg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VERLLB. -define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { -; CHECK-LABEL: test_verllb: -; CHECK: verllb %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) - ret <16 x i8> %res -} - -; VERLLH. -define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { -; CHECK-LABEL: test_verllh: -; CHECK: verllh %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) - ret <8 x i16> %res -} - -; VERLLF. -define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { -; CHECK-LABEL: test_verllf: -; CHECK: verllf %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) - ret <4 x i32> %res -} - -; VERLLG. -define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { -; CHECK-LABEL: test_verllg: -; CHECK: verllg %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) - ret <2 x i64> %res -} - -; VERLLB with the smallest count. 
-define <16 x i8> @test_verllb_1(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_1: -; CHECK: verllb %v24, %v24, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) - ret <16 x i8> %res -} - -; VERLLB with the largest count. -define <16 x i8> @test_verllb_4095(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4095: -; CHECK: verllb %v24, %v24, 4095 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) - ret <16 x i8> %res -} - -; VERLLB with the largest count + 1. -define <16 x i8> @test_verllb_4096(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4096: -; CHECK: lhi [[REG:%r[1-5]]], 4096 -; CHECK: verllb %v24, %v24, 0([[REG]]) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) - ret <16 x i8> %res -} - -; VERIMB. -define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_verimb: -; CHECK: verimb %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VERIMH. -define <8 x i16> @test_verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_verimh: -; CHECK: verimh %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, i32 1) - ret <8 x i16> %res -} - -; VERIMF. -define <4 x i32> @test_verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_verimf: -; CHECK: verimf %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1) - ret <4 x i32> %res -} - -; VERIMG. -define <2 x i64> @test_verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { -; CHECK-LABEL: test_verimg: -; CHECK: verimg %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, i32 1) - ret <2 x i64> %res -} - -; VERIMB with a different mask. 
-define <16 x i8> @test_verimb_254(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_verimb_254: -; CHECK: verimb %v24, %v26, %v28, 254 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 254) - ret <16 x i8> %res -} - -; VSL. -define <16 x i8> @test_vsl(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsl: -; CHECK: vsl %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsl(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSLB. -define <16 x i8> @test_vslb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vslb: -; CHECK: vslb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vslb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRA. -define <16 x i8> @test_vsra(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsra: -; CHECK: vsra %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsra(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRAB. -define <16 x i8> @test_vsrab(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrab: -; CHECK: vsrab %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrab(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRL. -define <16 x i8> @test_vsrl(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrl: -; CHECK: vsrl %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrl(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRLB. -define <16 x i8> @test_vsrlb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrlb: -; CHECK: vsrlb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSLDB with the minimum useful value. 
-define <16 x i8> @test_vsldb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsldb_1: -; CHECK: vsldb %v24, %v24, %v26, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VSLDB with the maximum value. -define <16 x i8> @test_vsldb_15(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsldb_15: -; CHECK: vsldb %v24, %v24, %v26, 15 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 15) - ret <16 x i8> %res -} - -; VSCBIB. -define <16 x i8> @test_vscbib(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vscbib: -; CHECK: vscbib %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vscbib(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSCBIH. -define <8 x i16> @test_vscbih(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vscbih: -; CHECK: vscbih %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vscbih(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VSCBIF. -define <4 x i32> @test_vscbif(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vscbif: -; CHECK: vscbif %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vscbif(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VSCBIG. -define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vscbig: -; CHECK: vscbig %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vscbig(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VSQ. -define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsq: -; CHECK: vsq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSBIQ. 
-define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vsbiq: -; CHECK: vsbiq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VSCBIQ. -define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vscbiq: -; CHECK: vscbiq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSBCBIQ. -define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vsbcbiq: -; CHECK: vsbcbiq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VSUMB. -define <4 x i32> @test_vsumb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsumb: -; CHECK: vsumb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vsumb(<16 x i8> %a, <16 x i8> %b) - ret <4 x i32> %res -} - -; VSUMH. -define <4 x i32> @test_vsumh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsumh: -; CHECK: vsumh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vsumh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VSUMGH. -define <2 x i64> @test_vsumgh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsumgh: -; CHECK: vsumgh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %a, <8 x i16> %b) - ret <2 x i64> %res -} - -; VSUMGF. -define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsumgf: -; CHECK: vsumgf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VSUMQF. 
-define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsumqf: -; CHECK: vsumqf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) - ret <16 x i8> %res -} - -; VSUMQG. -define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsumqg: -; CHECK: vsumqg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res -} - -; VTM with no processing of the result. -define i32 @test_vtm(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vtm: -; CHECK: vtm %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) - ret i32 %res -} - -; VTM, storing to %ptr if all bits are set. -define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vtm_all_store: -; CHECK-NOT: %r -; CHECK: vtm %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) - %cmp = icmp sge i32 %res, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret void -} - -; VCEQBS with no processing of the result. -define i32 @test_vceqbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vceqbs: -; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCEQBS, returning 1 if any elements are equal (CC != 3). 
-define i32 @test_vceqbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vceqbs_any_bool: -; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQBS, storing to %ptr if any elements are equal. -define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vceqbs_any_store: -; CHECK-NOT: %r -; CHECK: vceqbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCEQHS with no processing of the result. -define i32 @test_vceqhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vceqhs: -; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCEQHS, returning 1 if not all elements are equal. -define i32 @test_vceqhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vceqhs_notall_bool: -; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQHS, storing to %ptr if not all elements are equal. 
-define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vceqhs_notall_store: -; CHECK-NOT: %r -; CHECK: vceqhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCEQFS with no processing of the result. -define i32 @test_vceqfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vceqfs: -; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCEQFS, returning 1 if no elements are equal. -define i32 @test_vceqfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vceqfs_none_bool: -; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQFS, storing to %ptr if no elements are equal. 
-define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vceqfs_none_store: -; CHECK-NOT: %r -; CHECK: vceqfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCEQGS with no processing of the result. -define i32 @test_vceqgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vceqgs: -; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCEQGS returning 1 if all elements are equal (CC == 0). -define i32 @test_vceqgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vceqgs_all_bool: -; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ult i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQGS, storing to %ptr if all elements are equal. 
-define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vceqgs_all_store: -; CHECK-NOT: %r -; CHECK: vceqgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VCHBS with no processing of the result. -define i32 @test_vchbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchbs: -; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCHBS, returning 1 if any elements are higher (CC != 3). -define i32 @test_vchbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchbs_any_bool: -; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHBS, storing to %ptr if any elements are higher. 
-define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchbs_any_store: -; CHECK-NOT: %r -; CHECK: vchbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCHHS with no processing of the result. -define i32 @test_vchhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchhs: -; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCHHS, returning 1 if not all elements are higher. -define i32 @test_vchhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchhs_notall_bool: -; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHHS, storing to %ptr if not all elements are higher. 
-define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchhs_notall_store: -; CHECK-NOT: %r -; CHECK: vchhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCHFS with no processing of the result. -define i32 @test_vchfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchfs: -; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCHFS, returning 1 if no elements are higher. -define i32 @test_vchfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchfs_none_bool: -; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHFS, storing to %ptr if no elements are higher. 
-define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchfs_none_store: -; CHECK-NOT: %r -; CHECK: vchfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCHGS with no processing of the result. -define i32 @test_vchgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchgs: -; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCHGS returning 1 if all elements are higher (CC == 0). -define i32 @test_vchgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchgs_all_bool: -; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ult i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHGS, storing to %ptr if all elements are higher. 
-define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchgs_all_store: -; CHECK-NOT: %r -; CHECK: vchgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VCHLBS with no processing of the result. -define i32 @test_vchlbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchlbs: -; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCHLBS, returning 1 if any elements are higher (CC != 3). -define i32 @test_vchlbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchlbs_any_bool: -; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLBS, storing to %ptr if any elements are higher. 
-define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchlbs_any_store: -; CHECK-NOT: %r -; CHECK: vchlbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCHLHS with no processing of the result. -define i32 @test_vchlhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchlhs: -; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCHLHS, returning 1 if not all elements are higher. -define i32 @test_vchlhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchlhs_notall_bool: -; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp uge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLHS, storing to %ptr if not all elements are higher. 
-define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchlhs_notall_store: -; CHECK-NOT: %r -; CHECK: vchlhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sgt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCHLFS with no processing of the result. -define i32 @test_vchlfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchlfs: -; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCHLFS, returning 1 if no elements are higher. -define i32 @test_vchlfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchlfs_none_bool: -; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLFS, storing to %ptr if no elements are higher. 
-define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchlfs_none_store: -; CHECK-NOT: %r -; CHECK: vchlfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp sge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCHLGS with no processing of the result. -define i32 @test_vchlgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchlgs: -; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCHLGS returning 1 if all elements are higher (CC == 0). -define i32 @test_vchlgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchlgs_all_bool: -; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp slt i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLGS, storing to %ptr if all elements are higher. 
-define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchlgs_all_store: -; CHECK-NOT: %r -; CHECK: vchlgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFAEB with !IN !RT. -define <16 x i8> @test_vfaeb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_0: -; CHECK: vfaeb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 0) - ret <16 x i8> %res -} - -; VFAEB with !IN RT. -define <16 x i8> @test_vfaeb_4(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_4: -; CHECK: vfaeb %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 4) - ret <16 x i8> %res -} - -; VFAEB with IN !RT. -define <16 x i8> @test_vfaeb_8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_8: -; CHECK: vfaeb %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 8) - ret <16 x i8> %res -} - -; VFAEB with IN RT. -define <16 x i8> @test_vfaeb_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_12: -; CHECK: vfaeb %v24, %v24, %v26, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 12) - ret <16 x i8> %res -} - -; VFAEB with CS -- should be ignored. -define <16 x i8> @test_vfaeb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_1: -; CHECK: vfaeb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VFAEH. 
-define <8 x i16> @test_vfaeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfaeh: -; CHECK: vfaeh %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfaeh(<8 x i16> %a, <8 x i16> %b, i32 4) - ret <8 x i16> %res -} - -; VFAEF. -define <4 x i32> @test_vfaef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfaef: -; CHECK: vfaef %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfaef(<4 x i32> %a, <4 x i32> %b, i32 8) - ret <4 x i32> %res -} - -; VFAEBS. -define <16 x i8> @test_vfaebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaebs: -; CHECK: vfaebs %v24, %v24, %v26, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8> %a, <16 x i8> %b, - i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFAEHS. -define <8 x i16> @test_vfaehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaehs: -; CHECK: vfaehs %v24, %v24, %v26, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16> %a, <8 x i16> %b, - i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFAEFS. -define <4 x i32> @test_vfaefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaefs: -; CHECK: vfaefs %v24, %v24, %v26, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32> %a, <4 x i32> %b, - i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFAEZB with !IN !RT. 
-define <16 x i8> @test_vfaezb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_0: -; CHECK: vfaezb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 0) - ret <16 x i8> %res -} - -; VFAEZB with !IN RT. -define <16 x i8> @test_vfaezb_4(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_4: -; CHECK: vfaezb %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 4) - ret <16 x i8> %res -} - -; VFAEZB with IN !RT. -define <16 x i8> @test_vfaezb_8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_8: -; CHECK: vfaezb %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 8) - ret <16 x i8> %res -} - -; VFAEZB with IN RT. -define <16 x i8> @test_vfaezb_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_12: -; CHECK: vfaezb %v24, %v24, %v26, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 12) - ret <16 x i8> %res -} - -; VFAEZB with CS -- should be ignored. -define <16 x i8> @test_vfaezb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_1: -; CHECK: vfaezb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VFAEZH. -define <8 x i16> @test_vfaezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfaezh: -; CHECK: vfaezh %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfaezh(<8 x i16> %a, <8 x i16> %b, i32 4) - ret <8 x i16> %res -} - -; VFAEZF. -define <4 x i32> @test_vfaezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfaezf: -; CHECK: vfaezf %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfaezf(<4 x i32> %a, <4 x i32> %b, i32 8) - ret <4 x i32> %res -} - -; VFAEZBS. 
-define <16 x i8> @test_vfaezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezbs: -; CHECK: vfaezbs %v24, %v24, %v26, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8> %a, <16 x i8> %b, - i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFAEZHS. -define <8 x i16> @test_vfaezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezhs: -; CHECK: vfaezhs %v24, %v24, %v26, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16> %a, <8 x i16> %b, - i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFAEZFS. -define <4 x i32> @test_vfaezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezfs: -; CHECK: vfaezfs %v24, %v24, %v26, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32> %a, <4 x i32> %b, - i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFEEB. -define <16 x i8> @test_vfeeb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeeb_0: -; CHECK: vfeeb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeeb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFEEH. -define <8 x i16> @test_vfeeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeeh: -; CHECK: vfeeh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeeh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFEEF. 
-define <4 x i32> @test_vfeef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfeef: -; CHECK: vfeef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfeef(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFEEBS. -define <16 x i8> @test_vfeebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeebs: -; CHECK: vfeebs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFEEHS. -define <8 x i16> @test_vfeehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeehs: -; CHECK: vfeehs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFEEFS. -define <4 x i32> @test_vfeefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeefs: -; CHECK: vfeefs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFEEZB. -define <16 x i8> @test_vfeezb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeezb: -; CHECK: vfeezb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeezb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFEEZH. 
-define <8 x i16> @test_vfeezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeezh: -; CHECK: vfeezh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeezh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFEEZF. -define <4 x i32> @test_vfeezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfeezf: -; CHECK: vfeezf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfeezf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFEEZBS. -define <16 x i8> @test_vfeezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezbs: -; CHECK: vfeezbs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFEEZHS. -define <8 x i16> @test_vfeezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezhs: -; CHECK: vfeezhs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFEEZFS. -define <4 x i32> @test_vfeezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezfs: -; CHECK: vfeezfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFENEB. 
-define <16 x i8> @test_vfeneb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeneb_0: -; CHECK: vfeneb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeneb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFENEH. -define <8 x i16> @test_vfeneh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeneh: -; CHECK: vfeneh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeneh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFENEF. -define <4 x i32> @test_vfenef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfenef: -; CHECK: vfenef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfenef(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFENEBS. -define <16 x i8> @test_vfenebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenebs: -; CHECK: vfenebs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFENEHS. -define <8 x i16> @test_vfenehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenehs: -; CHECK: vfenehs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFENEFS. 
-define <4 x i32> @test_vfenefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenefs: -; CHECK: vfenefs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFENEZB. -define <16 x i8> @test_vfenezb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfenezb: -; CHECK: vfenezb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfenezb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFENEZH. -define <8 x i16> @test_vfenezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfenezh: -; CHECK: vfenezh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfenezh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFENEZF. -define <4 x i32> @test_vfenezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfenezf: -; CHECK: vfenezf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfenezf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFENEZBS. -define <16 x i8> @test_vfenezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezbs: -; CHECK: vfenezbs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFENEZHS. 
-define <8 x i16> @test_vfenezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezhs: -; CHECK: vfenezhs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFENEZFS. -define <4 x i32> @test_vfenezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezfs: -; CHECK: vfenezfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VISTRB. -define <16 x i8> @test_vistrb(<16 x i8> %a) { -; CHECK-LABEL: test_vistrb: -; CHECK: vistrb %v24, %v24 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vistrb(<16 x i8> %a) - ret <16 x i8> %res -} - -; VISTRH. -define <8 x i16> @test_vistrh(<8 x i16> %a) { -; CHECK-LABEL: test_vistrh: -; CHECK: vistrh %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vistrh(<8 x i16> %a) - ret <8 x i16> %res -} - -; VISTRF. -define <4 x i32> @test_vistrf(<4 x i32> %a) { -; CHECK-LABEL: test_vistrf: -; CHECK: vistrf %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vistrf(<4 x i32> %a) - ret <4 x i32> %res -} - -; VISTRBS. 
-define <16 x i8> @test_vistrbs(<16 x i8> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrbs: -; CHECK: vistrbs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8> %a) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VISTRHS. -define <8 x i16> @test_vistrhs(<8 x i16> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrhs: -; CHECK: vistrhs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16> %a) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VISTRFS. -define <4 x i32> @test_vistrfs(<4 x i32> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrfs: -; CHECK: vistrfs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32> %a) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VSTRCB with !IN !RT. -define <16 x i8> @test_vstrcb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_0: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - ret <16 x i8> %res -} - -; VSTRCB with !IN RT. -define <16 x i8> @test_vstrcb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_4: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 4) - ret <16 x i8> %res -} - -; VSTRCB with IN !RT. 
-define <16 x i8> @test_vstrcb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_8: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 8) - ret <16 x i8> %res -} - -; VSTRCB with IN RT. -define <16 x i8> @test_vstrcb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_12: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 12) - ret <16 x i8> %res -} - -; VSTRCB with CS -- should be ignored. -define <16 x i8> @test_vstrcb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_1: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VSTRCH. -define <8 x i16> @test_vstrch(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vstrch: -; CHECK: vstrch %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vstrch(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - ret <8 x i16> %res -} - -; VSTRCF. -define <4 x i32> @test_vstrcf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vstrcf: -; CHECK: vstrcf %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vstrcf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - ret <4 x i32> %res -} - -; VSTRCBS. 
-define <16 x i8> @test_vstrcbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrcbs: -; CHECK: vstrcbs %v24, %v24, %v26, %v28, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VSTRCHS. -define <8 x i16> @test_vstrchs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrchs: -; CHECK: vstrchs %v24, %v24, %v26, %v28, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VSTRCFS. -define <4 x i32> @test_vstrcfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrcfs: -; CHECK: vstrcfs %v24, %v24, %v26, %v28, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VSTRCZB with !IN !RT. -define <16 x i8> @test_vstrczb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_0: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - ret <16 x i8> %res -} - -; VSTRCZB with !IN RT. 
-define <16 x i8> @test_vstrczb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_4: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 4) - ret <16 x i8> %res -} - -; VSTRCZB with IN !RT. -define <16 x i8> @test_vstrczb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_8: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 8) - ret <16 x i8> %res -} - -; VSTRCZB with IN RT. -define <16 x i8> @test_vstrczb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_12: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 12) - ret <16 x i8> %res -} - -; VSTRCZB with CS -- should be ignored. -define <16 x i8> @test_vstrczb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_1: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VSTRCZH. -define <8 x i16> @test_vstrczh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vstrczh: -; CHECK: vstrczh %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vstrczh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - ret <8 x i16> %res -} - -; VSTRCZF. -define <4 x i32> @test_vstrczf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vstrczf: -; CHECK: vstrczf %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vstrczf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - ret <4 x i32> %res -} - -; VSTRCZBS. 
-define <16 x i8> @test_vstrczbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczbs: -; CHECK: vstrczbs %v24, %v24, %v26, %v28, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VSTRCZHS. -define <8 x i16> @test_vstrczhs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczhs: -; CHECK: vstrczhs %v24, %v24, %v26, %v28, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VSTRCZFS. -define <4 x i32> @test_vstrczfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczfs: -; CHECK: vstrczfs %v24, %v24, %v26, %v28, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFCEDBS with no processing of the result. 
-define i32 @test_vfcedbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfcedbs: -; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCEDBS, returning 1 if any elements are equal (CC != 3). -define i32 @test_vfcedbs_any_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfcedbs_any_bool: -; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCEDBS, storing to %ptr if any elements are equal. -define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfcedbs_any_store: -; CHECK-NOT: %r -; CHECK: vfcedbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFCHDBS with no processing of the result. -define i32 @test_vfchdbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchdbs: -; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCHDBS, returning 1 if not all elements are higher. 
-define i32 @test_vfchdbs_notall_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchdbs_notall_bool: -; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCHDBS, storing to %ptr if not all elements are higher. -define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfchdbs_notall_store: -; CHECK-NOT: %r -; CHECK: vfchdbs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFCHEDBS with no processing of the result. -define i32 @test_vfchedbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchedbs: -; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCHEDBS, returning 1 if neither element is higher or equal. 
-define i32 @test_vfchedbs_none_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchedbs_none_bool: -; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCHEDBS, storing to %ptr if neither element is higher or equal. -define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfchedbs_none_store: -; CHECK-NOT: %r -; CHECK: vfchedbs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFTCIDB with the lowest useful class selector and no processing of the result. -define i32 @test_vftcidb(<2 x double> %a) { -; CHECK-LABEL: test_vftcidb: -; CHECK: vftcidb {{%v[0-9]+}}, %v24, 1 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 1) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFTCIDB with the highest useful class selector, returning 1 if all elements -; have the right class (CC == 0). 
-define i32 @test_vftcidb_all_bool(<2 x double> %a) { -; CHECK-LABEL: test_vftcidb_all_bool: -; CHECK: vftcidb {{%v[0-9]+}}, %v24, 4094 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 4094) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp eq i32 %res, 0 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFIDB with a rounding mode not usable via standard intrinsics. -define <2 x double> @test_vfidb_0_4(<2 x double> %a) { -; CHECK-LABEL: test_vfidb_0_4: -; CHECK: vfidb %v24, %v24, 0, 4 -; CHECK: br %r14 - %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 0, i32 4) - ret <2 x double> %res -} - -; VFIDB with IEEE-inexact exception suppressed. -define <2 x double> @test_vfidb_4_0(<2 x double> %a) { -; CHECK-LABEL: test_vfidb_4_0: -; CHECK: vfidb %v24, %v24, 4, 0 -; CHECK: br %r14 - %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 4, i32 0) - ret <2 x double> %res -} - diff --git a/test/CodeGen/SystemZ/vec-max-05.ll b/test/CodeGen/SystemZ/vec-max-05.ll new file mode 100644 index 000000000000..591d3bf36f16 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-05.ll @@ -0,0 +1,175 @@ +; Test vector maximum on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @fmax(double, double) +declare double @llvm.maxnum.f64(double, double) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) + +declare float @fmaxf(float, float) +declare float @llvm.maxnum.f32(float, float) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) + +declare fp128 @fmaxl(fp128, fp128) +declare fp128 @llvm.maxnum.f128(fp128, fp128) + +; Test the fmax library function. 
+define double @f1(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f1: +; CHECK: wfmaxdb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @fmax(double %val1, double %val2) readnone + ret double %ret +} + +; Test the f64 maxnum intrinsic. +define double @f2(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmaxdb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @llvm.maxnum.f64(double %val1, double %val2) + ret double %ret +} + +; Test a f64 constant compare/select resulting in maxnum. +define double @f3(double %dummy, double %val) { +; CHECK-LABEL: f3: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmaxdb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp ogt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test a f64 constant compare/select resulting in maxnan. +define double @f4(double %dummy, double %val) { +; CHECK-LABEL: f4: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmaxdb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ugt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test the v2f64 maxnum intrinsic. +define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfmaxdb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %val1, <2 x double> %val2) + ret <2 x double> %ret +} + +; Test the fmaxf library function. +define float @f11(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f11: +; CHECK: wfmaxsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @fmaxf(float %val1, float %val2) readnone + ret float %ret +} + +; Test the f32 maxnum intrinsic. 
+define float @f12(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f12: +; CHECK: wfmaxsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @llvm.maxnum.f32(float %val1, float %val2) + ret float %ret +} + +; Test a f32 constant compare/select resulting in maxnum. +define float @f13(float %dummy, float %val) { +; CHECK-LABEL: f13: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfmaxsb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp ogt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test a f32 constant compare/select resulting in maxnan. +define float @f14(float %dummy, float %val) { +; CHECK-LABEL: f14: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfmaxsb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ugt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test the v4f32 maxnum intrinsic. +define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f15: +; CHECK: vfmaxsb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %val1, <4 x float> %val2) + ret <4 x float> %ret +} + +; Test the fmaxl library function. +define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f21: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @fmaxl(fp128 %val1, fp128 %val2) readnone + store fp128 %res, fp128* %dst + ret void +} + +; Test the f128 maxnum intrinsic. 
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f22: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @llvm.maxnum.f128(fp128 %val1, fp128 %val2) + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in maxnum. +define void @f23(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f23: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ogt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in maxnan. +define void @f24(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f24: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ugt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + diff --git a/test/CodeGen/SystemZ/vec-min-05.ll b/test/CodeGen/SystemZ/vec-min-05.ll new file mode 100644 index 000000000000..3eef9016cd08 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-05.ll @@ -0,0 +1,175 @@ +; Test vector minimum on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @fmin(double, double) +declare double @llvm.minnum.f64(double, double) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) + +declare float @fminf(float, float) +declare float @llvm.minnum.f32(float, float) +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) + +declare fp128 @fminl(fp128, fp128) +declare fp128 @llvm.minnum.f128(fp128, fp128) + +; Test the fmin library function. +define double @f1(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f1: +; CHECK: wfmindb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @fmin(double %val1, double %val2) readnone + ret double %ret +} + +; Test the f64 minnum intrinsic. +define double @f2(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmindb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @llvm.minnum.f64(double %val1, double %val2) + ret double %ret +} + +; Test a f64 constant compare/select resulting in minnum. +define double @f3(double %dummy, double %val) { +; CHECK-LABEL: f3: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmindb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp olt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test a f64 constant compare/select resulting in minnan. +define double @f4(double %dummy, double %val) { +; CHECK-LABEL: f4: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmindb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ult double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test the v2f64 minnum intrinsic. +define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfmindb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.minnum.v2f64(<2 x double> %val1, <2 x double> %val2) + ret <2 x double> %ret +} + +; Test the fminf library function. 
+define float @f11(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f11: +; CHECK: wfminsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @fminf(float %val1, float %val2) readnone + ret float %ret +} + +; Test the f32 minnum intrinsic. +define float @f12(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f12: +; CHECK: wfminsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @llvm.minnum.f32(float %val1, float %val2) + ret float %ret +} + +; Test a f32 constant compare/select resulting in minnum. +define float @f13(float %dummy, float %val) { +; CHECK-LABEL: f13: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfminsb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp olt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test a f32 constant compare/select resulting in minnan. +define float @f14(float %dummy, float %val) { +; CHECK-LABEL: f14: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfminsb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ult float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test the v4f32 minnum intrinsic. +define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f15: +; CHECK: vfminsb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.minnum.v4f32(<4 x float> %val1, <4 x float> %val2) + ret <4 x float> %ret +} + +; Test the fminl library function. +define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f21: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @fminl(fp128 %val1, fp128 %val2) readnone + store fp128 %res, fp128* %dst + ret void +} + +; Test the f128 minnum intrinsic. 
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f22: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @llvm.minnum.f128(fp128 %val1, fp128 %val2) + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in minnum. +define void @f23(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f23: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp olt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in minnan. +define void @f24(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f24: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ult fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + diff --git a/test/CodeGen/SystemZ/vec-move-18.ll b/test/CodeGen/SystemZ/vec-move-18.ll new file mode 100644 index 000000000000..5d3d09d83ef1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-18.ll @@ -0,0 +1,24 @@ +; Test insertions of memory values into 0 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test VLLEZLF. 
+define <4 x i32> @f1(i32 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vllezlf %v24, 0(%r2) +; CHECK: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0 + ret <4 x i32> %ret +} + +; Test VLLEZLF with a float. +define <4 x float> @f2(float *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vllezlf %v24, 0(%r2) +; CHECK: br %r14 + %val = load float, float *%ptr + %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0 + ret <4 x float> %ret +} + diff --git a/test/CodeGen/SystemZ/vec-mul-03.ll b/test/CodeGen/SystemZ/vec-mul-03.ll new file mode 100644 index 000000000000..3733db9fb339 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-03.ll @@ -0,0 +1,24 @@ +; Test vector multiplication on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 multiplication. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfmsb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fmul <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 multiplication that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmsb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fmul float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-04.ll b/test/CodeGen/SystemZ/vec-mul-04.ll new file mode 100644 index 000000000000..d96f0b6a745a --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-04.ll @@ -0,0 +1,31 @@ +; Test vector multiply-and-add on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Test a v4f32 multiply-and-add. 
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f1: +; CHECK: vfmasb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %val3) + ret <4 x float> %ret +} + +; Test a v4f32 multiply-and-subtract. +define <4 x float> @f2(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f2: +; CHECK: vfmssb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <4 x float> , %val3 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %negval3) + ret <4 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-05.ll b/test/CodeGen/SystemZ/vec-mul-05.ll new file mode 100644 index 000000000000..90a1f7a7efdf --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-05.ll @@ -0,0 +1,63 @@ +; Test vector negative multiply-and-add on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Test a v2f64 negative multiply-and-add. +define <2 x double> @f1(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f1: +; CHECK: vfnmadb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1, + <2 x double> %val2, + <2 x double> %val3) + %negret = fsub <2 x double> , %ret + ret <2 x double> %negret +} + +; Test a v2f64 negative multiply-and-subtract. 
+define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f2: +; CHECK: vfnmsdb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <2 x double> , %val3 + %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1, + <2 x double> %val2, + <2 x double> %negval3) + %negret = fsub <2 x double> , %ret + ret <2 x double> %negret +} + +; Test a v4f32 negative multiply-and-add. +define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f3: +; CHECK: vfnmasb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %val3) + %negret = fsub <4 x float> , %ret + ret <4 x float> %negret +} + +; Test a v4f32 negative multiply-and-subtract. +define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f4: +; CHECK: vfnmssb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <4 x float> , %val3 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %negval3) + %negret = fsub <4 x float> , %ret + ret <4 x float> %negret +} diff --git a/test/CodeGen/SystemZ/vec-neg-02.ll b/test/CodeGen/SystemZ/vec-neg-02.ll new file mode 100644 index 000000000000..07ce037542fd --- /dev/null +++ b/test/CodeGen/SystemZ/vec-neg-02.ll @@ -0,0 +1,23 @@ +; Test vector negation on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 negation. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vflcsb %v24, %v26 +; CHECK: br %r14 + %ret = fsub <4 x float> , %val + ret <4 x float> %ret +} + +; Test an f32 negation that uses vector registers. 
+define float @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: wflcsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = fsub float -0.0, %scalar + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-or-03.ll b/test/CodeGen/SystemZ/vec-or-03.ll new file mode 100644 index 000000000000..010629d880d1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-or-03.ll @@ -0,0 +1,91 @@ +; Test vector OR-NOT on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 OR-NOT. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <16 x i8> %val2, + %ret = or <16 x i8> %val1, %not + ret <16 x i8> %ret +} + +; ...and again with the reverse. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <16 x i8> %val1, + %ret = or <16 x i8> %not, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 OR-NOT. +define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <8 x i16> %val2, + %ret = or <8 x i16> %val1, %not + ret <8 x i16> %ret +} + +; ...and again with the reverse. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <8 x i16> %val1, + %ret = or <8 x i16> %not, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 OR-NOT. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <4 x i32> %val2, + %ret = or <4 x i32> %val1, %not + ret <4 x i32> %ret +} + +; ...and again with the reverse. 
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <4 x i32> %val1, + %ret = or <4 x i32> %not, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 OR-NOT. +define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <2 x i64> %val2, + %ret = or <2 x i64> %val1, %not + ret <2 x i64> %ret +} + +; ...and again with the reverse. +define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <2 x i64> %val1, + %ret = or <2 x i64> %not, %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-round-02.ll b/test/CodeGen/SystemZ/vec-round-02.ll new file mode 100644 index 000000000000..bcd66ea803d1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-round-02.ll @@ -0,0 +1,118 @@ +; Test v4f32 rounding on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.rint.f32(float) +declare float @llvm.nearbyint.f32(float) +declare float @llvm.floor.f32(float) +declare float @llvm.ceil.f32(float) +declare float @llvm.trunc.f32(float) +declare float @llvm.round.f32(float) +declare <4 x float> @llvm.rint.v4f32(<4 x float>) +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +declare <4 x float> @llvm.round.v4f32(<4 x float>) + +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vfisb %v24, %v24, 0, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: vfisb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f3(<4 x float> %val) { +; CHECK-LABEL: f3: +; CHECK: vfisb %v24, %v24, 4, 7 +; CHECK: br %r14 + %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f4(<4 x float> %val) { +; CHECK-LABEL: f4: +; CHECK: vfisb %v24, %v24, 4, 6 +; CHECK: br %r14 + %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f5(<4 x float> %val) { +; CHECK-LABEL: f5: +; CHECK: vfisb %v24, %v24, 4, 5 +; CHECK: br %r14 + %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f6(<4 x float> %val) { +; CHECK-LABEL: f6: +; CHECK: vfisb %v24, %v24, 4, 1 +; CHECK: br %r14 + %res = call <4 x float> @llvm.round.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define float @f7(<4 x float> %val) { +; CHECK-LABEL: f7: +; CHECK: wfisb %f0, %v24, 0, 0 +; CHECK: br %r14 + %scalar = extractelement <4 x float> 
%val, i32 0 + %res = call float @llvm.rint.f32(float %scalar) + ret float %res +} + +define float @f8(<4 x float> %val) { +; CHECK-LABEL: f8: +; CHECK: wfisb %f0, %v24, 4, 0 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.nearbyint.f32(float %scalar) + ret float %res +} + +define float @f9(<4 x float> %val) { +; CHECK-LABEL: f9: +; CHECK: wfisb %f0, %v24, 4, 7 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.floor.f32(float %scalar) + ret float %res +} + +define float @f10(<4 x float> %val) { +; CHECK-LABEL: f10: +; CHECK: wfisb %f0, %v24, 4, 6 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.ceil.f32(float %scalar) + ret float %res +} + +define float @f11(<4 x float> %val) { +; CHECK-LABEL: f11: +; CHECK: wfisb %f0, %v24, 4, 5 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.trunc.f32(float %scalar) + ret float %res +} + +define float @f12(<4 x float> %val) { +; CHECK-LABEL: f12: +; CHECK: wfisb %f0, %v24, 4, 1 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.round.f32(float %scalar) + ret float %res +} diff --git a/test/CodeGen/SystemZ/vec-sqrt-02.ll b/test/CodeGen/SystemZ/vec-sqrt-02.ll new file mode 100644 index 000000000000..6970d9db6698 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-sqrt-02.ll @@ -0,0 +1,23 @@ +; Test f32 and v4f32 square root on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.sqrt.f32(float) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vfsqsb %v24, %v24 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %val) + ret <4 x float> %ret +} + +define float @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: wfsqsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = call float @llvm.sqrt.f32(float %scalar) + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-sub-02.ll b/test/CodeGen/SystemZ/vec-sub-02.ll new file mode 100644 index 000000000000..83c76b5d4aa6 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-sub-02.ll @@ -0,0 +1,31 @@ +; Test vector subtraction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 subtraction. +define <4 x float> @f6(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK: vfssb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fsub <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 subtraction that uses vector registers. +define float @f7(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK: wfssb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fsub float %scalar1, %scalar2 + ret float %ret +} + +; Test a v2f32 subtraction, which gets promoted to v4f32. +define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) { +; No particular output expected, but must compile. + %ret = fsub <2 x float> %val1, %val2 + ret <2 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-xor-02.ll b/test/CodeGen/SystemZ/vec-xor-02.ll new file mode 100644 index 000000000000..b4b5a96ba254 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-xor-02.ll @@ -0,0 +1,47 @@ +; Test vector NOT-XOR on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 NOT-XOR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <16 x i8> %val1, %val2 + %not = xor <16 x i8> %ret, + ret <16 x i8> %not +} + +; Test a v8i16 NOT-XOR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <8 x i16> %val1, %val2 + %not = xor <8 x i16> %ret, + ret <8 x i16> %not +} + +; Test a v4i32 NOT-XOR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <4 x i32> %val1, %val2 + %not = xor <4 x i32> %ret, + ret <4 x i32> %not +} + +; Test a v2i64 NOT-XOR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <2 x i64> %val1, %val2 + %not = xor <2 x i64> %ret, + ret <2 x i64> %not +} diff --git a/test/CodeGen/Thumb/litpoolremat.ll b/test/CodeGen/Thumb/litpoolremat.ll new file mode 100644 index 000000000000..6ed9b0c2a7ce --- /dev/null +++ b/test/CodeGen/Thumb/litpoolremat.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s + +declare void @consume_value(i32) #1 + +declare i32 @get_value(...) #1 + +declare void @consume_three_values(i32, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define void @should_not_spill() #0 { + tail call void @consume_value(i32 1764) #2 + %1 = tail call i32 (...) @get_value() #2 + %2 = tail call i32 (...) @get_value() #2 + %3 = tail call i32 (...) 
@get_value() #2 + tail call void @consume_value(i32 %1) #2 + tail call void @consume_value(i32 %2) #2 + tail call void @consume_value(i32 %3) #2 + tail call void @consume_value(i32 1764) #2 + tail call void @consume_three_values(i32 %1, i32 %2, i32 %3) #2 + ret void +} + +; CHECK: ldr r0, LCPI0_0 +; CHECK-NOT: str r0 +; CHECK: bl +; CHECK: ldr r0, LCPI0_0 +; CHECK-LABEL: LCPI0_0: +; CHECK-NEXT: .long 1764 diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll index fe69a39e350c..75dbeab5ad0f 100644 --- a/test/CodeGen/Thumb/select.ll +++ b/test/CodeGen/Thumb/select.ll @@ -74,9 +74,9 @@ define double @f7(double %a, double %b) { } ; CHECK-LABEL: f7: ; CHECK: blt -; CHECK: blt +; CHECK: {{blt|bge}} ; CHECK: __ltdf2 ; CHECK-EABI-LABEL: f7: ; CHECK-EABI: __aeabi_dcmplt ; CHECK-EABI: bne -; CHECK-EABI: bne +; CHECK-EABI: {{bne|beq}} diff --git a/test/CodeGen/WebAssembly/indirect-import.ll b/test/CodeGen/WebAssembly/indirect-import.ll index 1bde65bcbbba..7cac31a2aef5 100644 --- a/test/CodeGen/WebAssembly/indirect-import.ll +++ b/test/CodeGen/WebAssembly/indirect-import.ll @@ -19,9 +19,9 @@ entry: %vs = alloca void (%struct.big*)*, align 4 %s = alloca void (%struct.big*)*, align 4 -; CHECK: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_vj@FUNCTION store float (double)* @extern_fd, float (double)** %fd, align 4 -; CHECK: i32.const {{.+}}=, extern_vj@FUNCTION store void (i64)* @extern_vj, void (i64)** %vj, align 4 %0 = load void (i64)*, void (i64)** %vj, align 4 call void %0(i64 1) @@ -36,10 +36,9 @@ entry: %2 = load i32 (i64, i32, double, float)*, i32 (i64, i32, double, float)** %ijidf, align 4 %call = call i32 %2(i64 1, i32 2, double 3.000000e+00, float 4.000000e+00) -; CHECK: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_struct, void 
(%struct.big*)** %vs, align 4 - -; CHECK: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_sret, void (%struct.big*)** %s, align 4 %3 = load float (double)*, float (double)** %fd, align 4 %4 = ptrtoint float (double)* %3 to i32 diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll index c160b391f6e8..2580771eb2cf 100644 --- a/test/CodeGen/WebAssembly/userstack.ll +++ b/test/CodeGen/WebAssembly/userstack.ll @@ -36,13 +36,13 @@ define void @alloca3264() { ; CHECK-NEXT: tee_local $push[[L5:.+]]=, [[SP:.+]], $pop[[L6]] %r1 = alloca i32 %r2 = alloca double - ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 - ; CHECK-NEXT: i32.store 12($pop[[L5]]), $pop[[L0]] store i32 0, i32* %r1 - ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} - ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 - ; CHECK-NEXT: i64.store 0($pop[[L2]]), $pop[[L1]] store double 0.0, double* %r2 + ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 + ; CHECK-NEXT: i64.store 0($pop[[L5]]), $pop[[L1]] + ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} + ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 + ; CHECK-NEXT: i32.store 12($pop[[L2]]), $pop[[L0]] ; CHECK-NEXT: return ret void } diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll index 7da85d3a9a1d..fa71bffaf8c6 100644 --- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll +++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s ; ; Test scheduling a multi-use compare. We should neither spill flags ; nor clone the compare. 
diff --git a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll index 8d387136da9c..37f01845db79 100644 --- a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll +++ b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s ; CHECK-NOT: -{{[1-9][0-9]*}}(%rsp) -define x86_64_win64cc x86_fp80 @a(i64 %x) nounwind readnone { +define win64cc x86_fp80 @a(i64 %x) nounwind readnone { entry: %conv = sitofp i64 %x to x86_fp80 ; [#uses=1] ret x86_fp80 %conv diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll index ba5de8eb5fcb..e812cbe3270a 100644 --- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -83,10 +83,11 @@ define void @full_test() { ; X32-NEXT: cmpeqps %xmm2, %xmm1 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; X32-NEXT: extractps $1, %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll index 9dff4e596caa..72807922a22b 100644 --- a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll +++ b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck --check-prefix=CHECK %s +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0); diff --git a/test/CodeGen/X86/alias-static-alloca.ll b/test/CodeGen/X86/alias-static-alloca.ll new file mode 100644 index 
000000000000..f4ca7e39f4fc --- /dev/null +++ b/test/CodeGen/X86/alias-static-alloca.ll @@ -0,0 +1,37 @@ +; RUN: llc -o - -mtriple=x86_64-linux-gnu %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; We should be able to bypass the load values to their corresponding +; stores here. + +; CHECK-LABEL: foo +; CHECK-DAG: movl %esi, -8(%rsp) +; CHECK-DAG: movl %ecx, -16(%rsp) +; CHECK-DAG: movl %edi, -4(%rsp) +; CHECK-DAG: movl %edx, -12(%rsp) +; CHECK: leal +; CHECK: addl +; CHECK: addl +; CHECK: retq + +define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %a0 = alloca i32 + %a1 = alloca i32 + %a2 = alloca i32 + %a3 = alloca i32 + store i32 %b, i32* %a1 + store i32 %d, i32* %a3 + store i32 %a, i32* %a0 + store i32 %c, i32* %a2 + %l0 = load i32, i32* %a0 + %l1 = load i32, i32* %a1 + %l2 = load i32, i32* %a2 + %l3 = load i32, i32* %a3 + %add0 = add nsw i32 %l0, %l1 + %add1 = add nsw i32 %add0, %l2 + %add2 = add nsw i32 %add1, %l3 + ret i32 %add2 +} diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll index d5d3fa6db5e8..1a6fde371f09 100644 --- a/test/CodeGen/X86/atomic-minmax-i6432.ll +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -9,32 +9,32 @@ define void @atomic_maxmin_i6432() { ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %2 = atomicrmw min i64* @sc64, i64 6 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %3 = atomicrmw umax i64* @sc64, i64 7 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %4 = atomicrmw umin i64* @sc64, i64 8 acquire ; LINUX: 
[[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] ret void diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll index 77bbdec826a5..c6300708bcc1 100644 --- a/test/CodeGen/X86/atomic128.ll +++ b/test/CodeGen/X86/atomic128.ll @@ -167,14 +167,24 @@ define void @fetch_and_min(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB5_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB5_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB5_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB5_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB5_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -203,14 +213,24 @@ define void @fetch_and_max(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %r8, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB6_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB6_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB6_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; 
CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB6_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB6_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -239,14 +259,24 @@ define void @fetch_and_umin(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setae %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB7_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB7_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB7_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB7_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB7_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -275,14 +305,24 @@ define void @fetch_and_umax(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setb %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB8_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB8_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq 
%rdx, %rcx +; CHECK-NEXT: jne LBB8_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB8_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB8_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index a12a412fb94d..953f3bdd06e8 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -27,9 +27,9 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %1, %2 @@ -57,9 +57,9 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fadd <8 x float> %1, %2 @@ -87,9 +87,9 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub 
; ; ZNVER1-LABEL: test_addsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -118,9 +118,9 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_addsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -152,10 +152,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_andnotpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 
x i64> %3 = xor <4 x i64> %1, @@ -193,10 +193,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_andnotps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, @@ -234,10 +234,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_andpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -273,10 +273,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: 
[3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -313,9 +313,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %a1, %1 @@ -345,8 +345,8 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> * ; ZNVER1-LABEL: test_blendps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -374,9 +374,9 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_blendvpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] 
+; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) %2 = load <4 x double>, <4 x double> *%a3, align 32 %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) @@ -405,9 +405,9 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_blendvps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) %2 = load <8 x float>, <8 x float> *%a3, align 32 %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) @@ -433,8 +433,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) { ; ; ZNVER1-LABEL: test_broadcastf128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 32 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> ret <8 x float> %2 @@ -458,8 +458,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) { ; ; ZNVER1-LABEL: test_broadcastsd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double *%a0, 
align 8 %2 = insertelement <4 x double> undef, double %1, i32 0 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer @@ -484,8 +484,8 @@ define <4 x float> @test_broadcastss(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer @@ -510,8 +510,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <8 x float> undef, float %1, i32 0 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer @@ -543,9 +543,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_cmppd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fcmp oeq <4 x double> %a0, %2 @@ -581,9 +581,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_cmpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # 
sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fcmp oeq <8 x float> %a0, %2 @@ -618,10 +618,10 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x double> @@ -655,10 +655,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 %3 = sitofp <8 x i32> %2 to <8 x float> @@ -690,10 +690,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2dq: ; ZNVER1: # 
BB#0: -; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x double> %a0 to <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptosi <4 x double> %2 to <4 x i32> @@ -725,10 +725,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc <4 x double> %a0 to <4 x float> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptrunc <4 x double> %2 to <4 x float> @@ -760,10 +760,10 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_cvtps2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <8 x float> %a0 to <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = fptosi <8 x float> %2 to <8 x i32> @@ -792,9 +792,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> 
; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fdiv <4 x double> %1, %2 @@ -822,9 +822,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fdiv <8 x float> %1, %2 @@ -853,8 +853,8 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ZNVER1-LABEL: test_dpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) @@ -886,9 +886,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa ; ZNVER1-LABEL: test_extractf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] 
+; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> store <4 x float> %2, <4 x float> *%a2 @@ -916,9 +916,9 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) @@ -947,9 +947,9 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) @@ -978,9 +978,9 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubpd (%rdi), 
%ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -1009,9 +1009,9 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -1044,9 +1044,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] -; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> %3 = load <4 x float>, <4 x float> *%a2, align 16 @@ -1074,8 +1074,8 @@ define <32 x i8> @test_lddqu(i8* %a0) { ; ; ZNVER1-LABEL: test_lddqu: ; ZNVER1: # BB#0: 
-; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ret <32 x i8> %1 } @@ -1108,7 +1108,7 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) ret <2 x double> %1 @@ -1143,7 +1143,7 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2 ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) ret <4 x double> %1 @@ -1178,7 +1178,7 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) ret <4 x float> %1 @@ -1213,7 +1213,7 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x 
i32> %a1, <8 x float> %a2) ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ret <8 x float> %1 @@ -1243,8 +1243,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_maxpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) @@ -1274,8 +1274,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_maxps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) @@ -1305,8 +1305,8 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_minpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: 
[8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) @@ -1336,8 +1336,8 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_minps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) @@ -1369,10 +1369,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 32 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 32 @@ -1403,10 +1403,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: 
[1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 32 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 32 @@ -1437,10 +1437,10 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1468,9 +1468,9 @@ define i32 @test_movmskpd(<4 x double> %a0) { ; ; ZNVER1-LABEL: test_movmskpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ret i32 %1 } @@ -1496,9 +1496,9 @@ define i32 @test_movmskps(<8 x float> %a0) { ; ; ZNVER1-LABEL: test_movmskps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ret i32 %1 } @@ 
-1525,9 +1525,9 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a0 store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 ret <4 x double> %1 @@ -1554,9 +1554,9 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a0 store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 ret <8 x float> %1 @@ -1586,10 +1586,10 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1621,10 +1621,10 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> 
*%a1) { ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1658,10 +1658,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 1 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 1 @@ -1694,10 +1694,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 1 %2 = 
fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 1 @@ -1725,9 +1725,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fmul <4 x double> %1, %2 @@ -1755,9 +1755,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_mulps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fmul <8 x float> %1, %2 @@ -1788,10 +1788,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) ; ; ZNVER1-LABEL: orpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1827,10 +1827,10 @@ define <8 x float> 
@test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ; ZNVER1-LABEL: test_orps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1866,10 +1866,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> @@ -1901,10 +1901,10 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x 
double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1936,10 +1936,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -1971,10 +1971,10 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -2004,8 +2004,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> ; ZNVER1-LABEL: test_permilvarpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> 
@llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) @@ -2035,8 +2035,8 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x ; ZNVER1-LABEL: test_permilvarpd_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) @@ -2066,8 +2066,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> * ; ZNVER1-LABEL: test_permilvarps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) @@ -2097,8 +2097,8 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3 ; ZNVER1-LABEL: test_permilvarps_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) %2 = load 
<8 x i32>, <8 x i32> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) @@ -2130,10 +2130,10 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rcpps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) @@ -2166,10 +2166,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_roundpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) @@ -2202,10 +2202,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_roundps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # 
sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) @@ -2238,10 +2238,10 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rsqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) @@ -2275,9 +2275,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ZNVER1-LABEL: test_shufpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2307,8 +2307,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ZNVER1-LABEL: test_shufps: ; ZNVER1: # 
BB#0: ; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] -; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2339,10 +2339,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_sqrtpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00] -; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) @@ -2375,10 +2375,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_sqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00] -; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x 
float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) @@ -2408,9 +2408,9 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_subpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fsub <4 x double> %1, %2 @@ -2438,9 +2438,9 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_subps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fsub <8 x float> %1, %2 @@ -2477,12 +2477,12 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; ; ZNVER1-LABEL: test_testpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) @@ -2523,13 +2523,13 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a ; ; ZNVER1-LABEL: test_testpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) @@ -2568,12 +2568,12 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; ; ZNVER1-LABEL: test_testps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) %2 = load 
<4 x float>, <4 x float> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) @@ -2614,13 +2614,13 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) ; ; ZNVER1-LABEL: test_testps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) @@ -2654,9 +2654,9 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpckhpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2686,8 +2686,8 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x 
float> ; ZNVER1-LABEL: test_unpckhps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2719,9 +2719,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2751,8 +2751,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpcklps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 
sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2783,10 +2783,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_xorpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2822,10 +2822,10 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_xorps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2856,7 +2856,7 @@ define void @test_zeroall() { ; ZNVER1-LABEL: test_zeroall: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroall() ret void } @@ -2881,7 +2881,7 @@ define void @test_zeroupper() { ; ZNVER1-LABEL: 
test_zeroupper: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroupper() ret void } diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index 017f54b40b2d..9918d6680256 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ -386,13 +386,13 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { define <4 x i32> @mul_const10(<4 x i32> %x) { ; X32-LABEL: mul_const10: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const10: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, @@ -403,13 +403,13 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { define <4 x i32> @mul_const11(<4 x i32> %x) { ; X32-LABEL: mul_const11: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const11: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 042bc217b97c..a3862d7e27c6 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -13,10 +13,10 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) { ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: 
vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) %2 = load <32 x i8>, <32 x i8> *%a1, align 32 %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2) @@ -35,10 +35,10 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) %2 = load <8 x i32>, <8 x i32> *%a1, align 32 %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2) @@ -57,10 +57,10 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) { ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) %2 = load <16 x i16>, <16 x i16> *%a1, align 32 %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2) @@ -78,9 +78,9 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x 
i8> *%a2) { ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = add <32 x i8> %1, %2 @@ -96,9 +96,9 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = add <8 x i32> %1, %2 @@ -114,9 +114,9 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = add <4 x i64> %1, %2 @@ -132,9 +132,9 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; 
ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = add <16 x i16> %1, %2 @@ -151,10 +151,10 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = and <4 x i64> %1, %2 @@ -172,10 +172,10 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %2 = and <4 x i64> %a1, %1 %3 = load <4 x i64>, <4 x i64> *%a2, align 32 @@ -194,9 +194,9 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # 
sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = mul <8 x i32> %1, %2 @@ -212,9 +212,9 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = mul <16 x i16> %1, %2 @@ -231,10 +231,10 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = or <4 x i64> %1, %2 @@ -251,9 +251,9 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = sub <32 x i8> %1, %2 @@ -269,9 
+269,9 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = sub <8 x i32> %1, %2 @@ -287,9 +287,9 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = sub <4 x i64> %1, %2 @@ -305,9 +305,9 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = sub <16 x i16> %1, %2 @@ -324,10 +324,10 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq 
%ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = xor <4 x i64> %1, %2 diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll index 127726ea30da..c77714b9e181 100644 --- a/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/test/CodeGen/X86/avx2-vector-shifts.ll @@ -376,7 +376,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X32: # BB#0: ; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm2 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper @@ -386,7 +386,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64: # BB#0: ; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 140299f5495d..e10a781fabc2 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1507,7 +1507,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { ; NOVL: # BB#0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: retq ; diff --git 
a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index e1a92c60d182..6f4bf061a215 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1630,8 +1630,9 @@ define void @f1(i32 %c) { ; CHECK-LABEL: f1: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movzbl {{.*}}(%rip), %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb {{.*}}(%rip), %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{.*}}(%rip) ; CHECK-NEXT: xorl $1, %edi ; CHECK-NEXT: jmp _f2 ## TAILCALL diff --git a/test/CodeGen/X86/avx512-rotate.ll b/test/CodeGen/X86/avx512-rotate.ll new file mode 100644 index 000000000000..98fa67ad793d --- /dev/null +++ b/test/CodeGen/X86/avx512-rotate.ll @@ -0,0 +1,256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; Tests showing replacement of variable rotates with immediate splat versions. 
+ +define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> 
@llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $5, 
%zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions. + +define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: 
test_splat_bounds_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> 
zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Constant folding + +define <8 x i64> @test_fold_rol_v8i64() { +; CHECK-LABEL: test_fold_rol_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_fold_ror_v8i64() { +; CHECK-LABEL: test_fold_ror_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = 
[1,9223372036854775808,4611686018427387904,2,9223372036854775808,4,2,2] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll index 10883a5a9a62..ce2b010ec0f2 100644 --- a/test/CodeGen/X86/avx512-shift.ll +++ b/test/CodeGen/X86/avx512-shift.ll @@ -1,136 +1,178 @@ -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX -;CHECK-LABEL: shift_16_i32 -;CHECK: vpsrld -;CHECK: vpslld -;CHECK: vpsrad -;CHECK: ret define <16 x i32> @shift_16_i32(<16 x i32> %a) { +; CHECK-LABEL: shift_16_i32: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld $1, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <16 x i32> %a, %c = shl <16 x i32> %b, %d = ashr <16 x i32> %c, ret <16 x i32> %d; } -;CHECK-LABEL: shift_8_i64 -;CHECK: vpsrlq -;CHECK: vpsllq -;CHECK: vpsraq -;CHECK: ret define <8 x i64> @shift_8_i64(<8 x i64> %a) { +; CHECK-LABEL: shift_8_i64: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0 +; CHECK-NEXT: vpsllq $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsraq $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <8 x i64> %a, %c = shl <8 x i64> %b, %d = ashr <8 x i64> %c, ret <8 x i64> %d; } -;SKX-LABEL: shift_4_i64 -;SKX: vpsrlq -;SKX: vpsllq -;SKX: vpsraq -;SKX: ret define <4 x i64> @shift_4_i64(<4 x i64> %a) { +; KNL-LABEL: shift_4_i64: +; KNL: # BB#0: +; KNL-NEXT: vpsrlq $1, %ymm0, %ymm0 +; KNL-NEXT: vpsllq $12, %ymm0, %ymm0 +; 
KNL-NEXT: vpsraq $12, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: shift_4_i64: +; SKX: # BB#0: +; SKX-NEXT: vpsrlq $1, %ymm0, %ymm0 +; SKX-NEXT: vpsllq $12, %ymm0, %ymm0 +; SKX-NEXT: vpsraq $12, %ymm0, %ymm0 +; SKX-NEXT: retq %b = lshr <4 x i64> %a, %c = shl <4 x i64> %b, %d = ashr <4 x i64> %c, ret <4 x i64> %d; } -; CHECK-LABEL: variable_shl4 -; CHECK: vpsllvq %zmm -; CHECK: ret define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_shl4: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_shl5 -; CHECK: vpsllvd %zmm -; CHECK: ret define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_shl5: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl0 -; CHECK: vpsrlvd -; CHECK: ret define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_srl0: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl2 -; CHECK: psrlvq -; CHECK: ret define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_srl2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_sra1 -; CHECK: vpsravd -; CHECK: ret define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_sra1: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_sra2 -; CHECK: vpsravq %zmm -; CHECK: ret define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_sra2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravq %zmm1, 
%zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <8 x i64> %x, %y ret <8 x i64> %k } -; SKX-LABEL: variable_sra3 -; SKX: vpsravq %ymm -; SKX: ret define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) { +; KNL-LABEL: variable_sra3: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra3: +; SKX: # BB#0: +; SKX-NEXT: vpsravq %ymm1, %ymm0, %ymm0 +; SKX-NEXT: retq %k = ashr <4 x i64> %x, %y ret <4 x i64> %k } -; SKX-LABEL: variable_sra4 -; SKX: vpsravw %xmm -; SKX: ret define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) { +; KNL-LABEL: variable_sra4: +; KNL: # BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovsxwd %xmm0, %ymm0 +; KNL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra4: +; SKX: # BB#0: +; SKX-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; SKX-NEXT: retq %k = ashr <8 x i16> %x, %y ret <8 x i16> %k } -; CHECK-LABEL: variable_sra01_load -; CHECK: vpsravd (% -; CHECK: ret define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_sra01_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = ashr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK-LABEL: variable_shl1_load -; CHECK: vpsllvd (% -; CHECK: ret define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_shl1_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = shl <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl0_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <16 x i32> 
@variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_srl0_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = lshr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl3_load -; CHECK: vpsrlvq (% -; CHECK: ret define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) { +; CHECK-LABEL: variable_srl3_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <8 x i64>, <8 x i64>* %y %k = lshr <8 x i64> %x, %y1 ret <8 x i64> %k diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll new file mode 100644 index 000000000000..75be2d9c0f01 --- /dev/null +++ b/test/CodeGen/X86/bmi-schedule.ll @@ -0,0 +1,529 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) { +; GENERIC-LABEL: test_andn_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %eax +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: andw (%rdx), %di +; GENERIC-NEXT: addl %edi, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; 
GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: notl %edi # sched: [1:0.25] +; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50] +; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: notl %edi # sched: [1:0.50] +; BTVER2-NEXT: andw (%rdx), %di # sched: [4:1.00] +; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: notl %edi # sched: [1:0.25] +; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50] +; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a2 + %2 = xor i16 %a0, -1 + %3 = and i16 %2, %a1 + %4 = and i16 %2, %1 + %5 = add i16 %3, %4 + ret i16 %5 +} + +define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_andn_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %ecx +; GENERIC-NEXT: andnl (%rdx), %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl (%rdx), %edi, %eax # sched: [4:1.00] +; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: 
andnl (%rdx), %edi, %eax # sched: [5:0.50] +; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = xor i32 %a0, -1 + %3 = and i32 %2, %a1 + %4 = and i32 %2, %1 + %5 = add i32 %3, %4 + ret i32 %5 +} + +define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_andn_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnq %rsi, %rdi, %rcx +; GENERIC-NEXT: andnq (%rdx), %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:1.00] +; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50] +; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = xor i64 %a0, -1 + %3 = and i64 %2, %a1 + %4 = and i64 %2, %1 + %5 = add i64 %3, %4 + ret i64 %5 +} + +define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bextr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx +; GENERIC-NEXT: bextrl %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50] +; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: 
[1:1.00] +; +; BTVER2-LABEL: test_bextr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bextr.32(i32, i32) + +define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bextr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50] +; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0) + %3 = tail call i64 
@llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bextr.64(i64, i64) + +define i32 @test_blsi_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsil (%rsi), %ecx +; GENERIC-NEXT: blsil %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 0, %1 + %3 = sub i32 0, %a0 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsi_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsiq (%rsi), %rcx +; GENERIC-NEXT: blsiq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 0, %1 + %3 = sub i64 0, %a0 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsmsk_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskl (%rsi), %ecx +; GENERIC-NEXT: blsmskl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = xor i32 %1, %2 + %5 = xor i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsmsk_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskq (%rsi), %rcx +; GENERIC-NEXT: blsmskq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50] +; 
HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = xor i64 %1, %2 + %5 = xor i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsr_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrl (%rsi), %ecx +; GENERIC-NEXT: blsrl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + 
+define i64 @test_blsr_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrq (%rsi), %rcx +; GENERIC-NEXT: blsrq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_cttz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntw (%rsi), %cx +; GENERIC-NEXT: tzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # 
sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.cttz.i16(i16, i1) + +define i32 @test_cttz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cttz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntl (%rsi), %ecx +; GENERIC-NEXT: tzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.cttz.i32(i32, i1) + +define i64 @test_cttz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cttz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntq (%rsi), %rcx +; GENERIC-NEXT: tzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; 
HASWELL-LABEL: test_cttz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.cttz.i64(i64, i1) diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll new file mode 100644 index 000000000000..9666dd85d853 --- /dev/null +++ b/test/CodeGen/X86/bmi2-schedule.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_bzhi_i32(i32 %a0, 
i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bzhi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx +; GENERIC-NEXT: bzhil %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50] +; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) + +define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bzhi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50] +; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) 
+ +define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pdep_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx +; GENERIC-NEXT: pdepl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pdep.32(i32, i32) + +define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pdep_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pdepq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} 
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64) + +define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pext_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextl (%rdx), %edi, %ecx +; GENERIC-NEXT: pextl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pext.32(i32, i32) + +define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pext_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pextq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) 
+ %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pext.64(i64, i64) diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index e292ccd0be11..7c1042878d59 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -19,7 +19,7 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> @@ -31,7 +31,7 @@ define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y @@ -56,7 +56,7 @@ define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y @@ -91,7 +91,7 @@ define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b diff --git a/test/CodeGen/X86/bswap-rotate.ll b/test/CodeGen/X86/bswap-rotate.ll new file mode 100644 index 000000000000..f686febe5645 --- /dev/null +++ 
b/test/CodeGen/X86/bswap-rotate.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +; Combine BSWAP (lowered to rolw 8) with a second rotate. +; This test checks for combining rotates with inconsistent constant value types. + +define i16 @combine_bswap_rotate(i16 %a0) { +; X86-LABEL: combine_bswap_rotate: +; X86: # BB#0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: retl +; +; X64-LABEL: combine_bswap_rotate: +; X64: # BB#0: +; X64-NEXT: rolw $9, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + %1 = call i16 @llvm.bswap.i16(i16 %a0) + %2 = shl i16 %1, 1 + %3 = lshr i16 %1, 15 + %4 = or i16 %2, %3 + ret i16 %4 +} + +declare i16 @llvm.bswap.i16(i16) diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll index 02f1a1616db2..b69b18531601 100644 --- a/test/CodeGen/X86/clobber-fi0.ll +++ b/test/CodeGen/X86/clobber-fi0.ll @@ -15,22 +15,22 @@ bb: %tmp = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp2 = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp3 = alloca i32 ; [#uses=1 type=i32*] - store i32 1, i32* %tmp, align 4 - store i32 1, i32* %tmp2, align 4 + store volatile i32 1, i32* %tmp, align 4 + store volatile i32 1, i32* %tmp2, align 4 br label %bb4 bb4: ; preds = %bb4, %bb - %tmp6 = load i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] + %tmp6 = load volatile i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] %tmp7 = add i32 %tmp6, -1 ; [#uses=2 type=i32] - store i32 %tmp7, i32* %tmp2, align 4 + store volatile i32 %tmp7, i32* %tmp2, align 4 %tmp8 = icmp eq i32 %tmp7, 0 ; [#uses=1 type=i1] - %tmp9 = load i32, i32* %tmp ; [#uses=1 type=i32] + %tmp9 = load volatile i32, i32* %tmp ; [#uses=1 type=i32] %tmp10 = add i32 %tmp9, -1 ; [#uses=1 type=i32] - store i32 %tmp10, i32* %tmp3 + store volatile i32 
%tmp10, i32* %tmp3 br i1 %tmp8, label %bb11, label %bb4 bb11: ; preds = %bb4 - %tmp12 = load i32, i32* %tmp, align 4 ; [#uses=1 type=i32] + %tmp12 = load volatile i32, i32* %tmp, align 4 ; [#uses=1 type=i32] ret i32 %tmp12 } diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll index 713ee5d0f65a..0d74c937af33 100644 --- a/test/CodeGen/X86/combine-rotates.ll +++ b/test/CodeGen/X86/combine-rotates.ll @@ -6,22 +6,12 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { ; XOP-LABEL: combine_vec_rot_rot: ; XOP: # BB#0: -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_rot: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -40,12 +30,7 @@ define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $10, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprold $7, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -63,12 +48,6 @@ define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat_zero: ; AVX512: # BB#0: -; 
AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3dbff2680c22..a6491a0a8694 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -392,7 +392,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -437,7 +437,7 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -481,7 +481,7 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, @@ -515,7 +515,7 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) { ; AVX-LABEL: combine_vec_shl_add0: ; AVX: # BB#0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %x, @@ -550,7 +550,7 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) { ; ; AVX-LABEL: 
combine_vec_shl_or0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -585,7 +585,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_mul0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 21564cdd7353..473fae19f4fd 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -91,7 +91,7 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_known_zero1: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq @@ -326,7 +326,7 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, @@ -376,10 +376,10 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-udiv.ll 
b/test/CodeGen/X86/combine-udiv.ll index e1e849929405..b6ae2fa6d157 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -166,7 +166,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll index 91da268a8d75..4c7716bbaebe 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -43,7 +43,7 @@ define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, @@ -87,7 +87,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2c: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -146,7 +146,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2d: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -183,7 +183,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: 
vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll new file mode 100644 index 000000000000..15ae4a49d7d3 --- /dev/null +++ b/test/CodeGen/X86/f16c-schedule.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=IVY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, 
%xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) + %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) + %4 = fadd <4 x float> %2, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) + +define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) + %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) + %4 = fadd <8 x float> %2, %3 + ret <8 x float> %4 +} +declare <8 
x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) + +define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0) + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> + store <4 x i16> %3, <4 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) + +define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_256: +; BTVER2: # 
BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0) + store <8 x i16> %2, <8 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index 3d5c12c03484..c87353ed1f5a 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -316,7 +316,7 @@ define void @allocamaterialize() { ; STDERR-NOT: FastISel missed terminator: ret void ; CHECK-LABEL: win64ccfun -define x86_64_win64cc void @win64ccfun(i32 %i) { +define win64cc void @win64ccfun(i32 %i) { ; CHECK: ret ret void } diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll index fbc4cd9d4f9c..86469dad23f2 100644 --- a/test/CodeGen/X86/hipe-cc.ll +++ b/test/CodeGen/X86/hipe-cc.ll @@ -48,11 +48,7 @@ entry: store i32 %arg0, i32* %arg0_var store i32 %arg1, i32* %arg1_var store i32 %arg2, i32* %arg2_var - - ; CHECK: movl 16(%esp), %esi - ; CHECK-NEXT: movl 12(%esp), %ebp - ; CHECK-NEXT: movl 8(%esp), %eax - ; CHECK-NEXT: movl 4(%esp), %edx + ; These loads are loading the values from their previous stores and are optimized away. 
%0 = load i32, i32* %hp_var %1 = load i32, i32* %p_var %2 = load i32, i32* %arg0_var diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll index 43e2e1409fde..efe07cf6301e 100644 --- a/test/CodeGen/X86/hipe-cc64.ll +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -57,11 +57,7 @@ entry: store i64 %arg2, i64* %arg2_var store i64 %arg3, i64* %arg3_var - ; CHECK: movq 40(%rsp), %r15 - ; CHECK-NEXT: movq 32(%rsp), %rbp - ; CHECK-NEXT: movq 24(%rsp), %rsi - ; CHECK-NEXT: movq 16(%rsp), %rdx - ; CHECK-NEXT: movq 8(%rsp), %rcx + ; Loads are reading values just written from corresponding register and are therefore noops. %0 = load i64, i64* %hp_var %1 = load i64, i64* %p_var %2 = load i64, i64* %arg0_var diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll new file mode 100644 index 000000000000..e42ce30c5a6d --- /dev/null +++ b/test/CodeGen/X86/lea32-schedule.ll @@ -0,0 +1,653 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_lea_offset(i32) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -24(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, -24 + ret i32 %2 +} + +define i32 @test_lea_offset_big(i32) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: 
leal 1024(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 1024(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, 1024 + ret i32 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @test_lea_add(i32, i32) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; 
SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i32 %1, %0 + ret i32 %3 +} + +define i32 @test_lea_add_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 16(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI 
%RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $16, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, 16 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_add_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -4096(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal 
(%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-4096, %eax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, -4096 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_mul(i32) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal 
(%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + ret i32 %2 +} + +define i32 @test_lea_mul_offset(i32) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -32(%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-32, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: 
leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + %3 = add nsw i32 %2, -32 + ret i32 %3 +} + +define i32 @test_lea_mul_offset_big(i32) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 10000(%rdi,%rdi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $10000, %eax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 9 + %3 = add nsw i32 %2, 10000 + ret i32 %3 +} + +define i32 @test_lea_add_scale(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale: 
+; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 1 + %4 = add nsw i32 %3, %0 + ret i32 %4 +} + +define i32 @test_lea_add_scale_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # 
sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 96(%rdi,%rsi,4), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $96, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 2 + %4 = add i32 %0, 96 + %5 = add i32 %4, %3 + ret i32 %5 +} + +define i32 @test_lea_add_scale_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: 
[1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -1200(%rdi,%rsi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-1200, %eax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 3 + %4 = add i32 %0, -1200 + %5 = add i32 %4, %3 + ret i32 %5 +} diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll new file mode 100644 index 000000000000..0ff1574c809d --- /dev/null +++ b/test/CodeGen/X86/lea64-schedule.ll @@ -0,0 
+1,534 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i64 @test_lea_offset(i64) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -24(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, -24 + ret i64 %2 +} + +define i64 @test_lea_offset_big(i64) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 1024(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, 1024 + ret i64 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i64 @test_lea_add(i64, i64) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: 
# BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i64 %1, %0 + ret i64 %3 +} + +define i64 @test_lea_add_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 16(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $16, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; 
HASWELL-NEXT: addq $16, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, 16 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_add_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -4096(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-4096, %rax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, -4096 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_mul(i64) { +; GENERIC-LABEL: 
test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + ret i64 %2 +} + +define i64 @test_lea_mul_offset(i64) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -32(%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-32, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 
(%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + %3 = add nsw i64 %2, -32 + ret i64 %3 +} + +define i64 @test_lea_mul_offset_big(i64) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 10000(%rdi,%rdi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $10000, %rax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 9 + %3 = add nsw i64 %2, 10000 + 
ret i64 %3 +} + +define i64 @test_lea_add_scale(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 1 + %4 = add nsw i64 %3, %0 + ret i64 %4 +} + +define i64 @test_lea_add_scale_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 96(%rdi,%rsi,4), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,4), %rax # sched: 
[1:0.50] +; SANDY-NEXT: addq $96, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 2 + %4 = add i64 %0, 96 + %5 = add i64 %4, %3 + ret i64 %5 +} + +define i64 @test_lea_add_scale_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -1200(%rdi,%rsi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-1200, %rax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 3 + %4 = add i64 %0, -1200 + %5 = add i64 %4, %3 + ret i64 %5 +} diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index b3f2116e6486..3ad6cad32d83 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -148,8 +148,7 @@ define i32 @test6() { ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: shldl $32, %eax, %ecx @@ -175,9 +174,8 @@ define i32 @test6() { ; CHECK-NEXT: retl %x = alloca i32, align 4 %t = alloca i64, align 8 - store i32 1, i32* %x, align 4 - store i64 1, i64* %t, align 8 ;; DEAD - %load = load i32, i32* %x, align 4 + store volatile i32 1, i32* %x, align 4 + %load = load volatile i32, i32* %x, align 4 %shl = shl i32 %load, 8 %add = add i32 %shl, -224 %sh_prom = zext i32 %add to i64 diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll new file mode 100644 index 000000000000..cd0dcbbd6afb --- /dev/null +++ b/test/CodeGen/X86/lzcnt-schedule.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctlz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntw (%rsi), %cx +; GENERIC-NEXT: lzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntw (%rsi), %cx +; HASWELL-NEXT: lzcntw %di, %ax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntw (%rsi), %cx +; BTVER2-NEXT: lzcntw %di, %ax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntw (%rsi), %cx +; ZNVER1-NEXT: lzcntw %di, %ax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctlz.i16(i16, i1) + +define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctlz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntl (%rsi), %ecx +; GENERIC-NEXT: lzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntl (%rsi), %ecx +; HASWELL-NEXT: lzcntl %edi, %eax +; HASWELL-NEXT: orl 
%ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntl (%rsi), %ecx +; BTVER2-NEXT: lzcntl %edi, %eax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntl (%rsi), %ecx +; ZNVER1-NEXT: lzcntl %edi, %eax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctlz.i32(i32, i1) + +define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctlz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntq (%rsi), %rcx +; GENERIC-NEXT: lzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntq (%rsi), %rcx +; HASWELL-NEXT: lzcntq %rdi, %rax +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntq (%rsi), %rcx +; BTVER2-NEXT: lzcntq %rdi, %rax +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntq (%rsi), %rcx +; ZNVER1-NEXT: lzcntq %rdi, %rax +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctlz.i64(i64, i1) diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll index 26a194764086..02d0964e37eb 100644 --- a/test/CodeGen/X86/machine-outliner-debuginfo.ll +++ 
b/test/CodeGen/X86/machine-outliner-debuginfo.ll @@ -17,6 +17,7 @@ define i32 @main() #0 !dbg !11 { call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !15, metadata !16), !dbg !17 store i32 4, i32* %5, align 4 store i32 0, i32* @x, align 4, !dbg !24 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; This is the same sequence of instructions without a debug value. It should be outlined ; in the same way. ; CHECK: callq l_OUTLINED_FUNCTION_0 diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll index 9f8e6ec298f4..b4a277ec2d82 100644 --- a/test/CodeGen/X86/machine-outliner.ll +++ b/test/CodeGen/X86/machine-outliner.ll @@ -85,6 +85,7 @@ define i32 @main() #0 { store i32 3, i32* %4, align 4 store i32 4, i32* %5, align 4 store i32 1, i32* @x, align 4 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: callq [[OFUNC2]] store i32 1, i32* %2, align 4 store i32 2, i32* %3, align 4 diff --git a/test/CodeGen/X86/memcmp-minsize.ll b/test/CodeGen/X86/memcmp-minsize.ll new file mode 100644 index 000000000000..a7f42644ca2d --- /dev/null +++ b/test/CodeGen/X86/memcmp-minsize.ll @@ -0,0 +1,721 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* 
%X, i8* %Y) nounwind minsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; 
X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret 
i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $4, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: pushq $4 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, 
(%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: pushq $8 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; 
X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: +; X64-NEXT: pushq $16 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl 
%eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: 
cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: pushq $32 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; 
X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, 
i8* %Y) nounwind minsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; 
X64-NEXT: popq %rdx +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll new file mode 100644 index 000000000000..450205a966d2 --- /dev/null +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -0,0 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # 
BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: 
movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; 
X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m 
+} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: 
movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; 
X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: incl %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB13_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: 
retl +; +; X64-LABEL: length12: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; 
X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; 
X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; 
X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl 
$64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 0e09abf73c8c..2e6782765462 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 
--check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=AVX2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -12,43 +12,21 @@ declare i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length2: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movzwl (%ecx), %ecx -; X86-NOSSE-NEXT: movzwl (%eax), %eax -; X86-NOSSE-NEXT: rolw $8, %cx -; X86-NOSSE-NEXT: rolw $8, %ax -; X86-NOSSE-NEXT: cmpw %ax, %cx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB0_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB0_3 -; X86-NOSSE-NEXT: .LBB0_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB0_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB0_4 -; X86-NOSSE-NEXT: .LBB0_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length2: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movzwl (%ecx), %ecx -; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: rolw $8, %cx -; X86-SSE2-NEXT: rolw $8, %ax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpw %ax, %cx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, 
%eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length2: ; X64: # BB#0: @@ -137,44 +115,90 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -182,43 +206,21 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length4: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %ecx -; X86-NOSSE-NEXT: movl (%eax), %eax -; X86-NOSSE-NEXT: bswapl %ecx -; X86-NOSSE-NEXT: bswapl %eax -; X86-NOSSE-NEXT: cmpl %eax, %ecx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB6_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB6_3 -; X86-NOSSE-NEXT: .LBB6_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB6_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB6_4 -; X86-NOSSE-NEXT: .LBB6_3: -; 
X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length4: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %ecx -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: bswapl %ecx -; X86-SSE2-NEXT: bswapl %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpl %eax, %ecx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length4: ; X64: # BB#0: @@ -278,44 +280,86 @@ define i1 @length4_eq_const(i8* %X) nounwind { define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: 
length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 
@memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -324,13 +368,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -352,13 +416,20 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl 
%eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -376,13 +447,17 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -400,25 +475,43 @@ define i1 @length8_eq_const(i8* %X) nounwind { define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: je .LBB14_4 +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_4: # %endblock +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq 
(%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -427,19 +520,66 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 
(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -448,111 +588,165 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: 
.LBB16_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: 
pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 12(%eax), %edx +; X86-NEXT: je .LBB17_5 +; X86-NEXT: .LBB17_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB17_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $858927408, (%eax) # imm = 0x33323130 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: cmpl $926299444, 4(%eax) # imm = 0x37363534 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: cmpl $825243960, 8(%eax) # imm = 0x31303938 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $892613426, 12(%eax) # imm = 0x35343332 +; X86-NEXT: je .LBB18_5 +; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB18_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne 
.LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -570,9 +764,43 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length32: -; X64: # BB#0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#5: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m } @@ -592,25 +820,30 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # 
BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: cmpq 8(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: cmpq 16(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 24(%rsi), %rcx +; X64-NEXT: je .LBB20_5 +; X64-NEXT: .LBB20_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB20_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -629,26 +862,30 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; 
X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rax, 8(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-NEXT: cmpq %rax, 16(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3544395820347831604, %rcx # imm = 0x3130393837363534 +; X64-NEXT: cmpq %rcx, 24(%rdi) +; X64-NEXT: je .LBB21_5 +; X64-NEXT: .LBB21_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB21_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 50a661fcca11..76d750855cd4 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -105,7 +105,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v4i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -523,7 +523,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v8i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] ; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq entry: @@ -551,7 +551,7 @@ define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { ; ; AVX-LABEL: mul_v4i64c: ; AVX: # BB#0: # %entry -; 
AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll new file mode 100644 index 000000000000..c0d11280fc1d --- /dev/null +++ b/test/CodeGen/X86/popcnt-schedule.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+popcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctpop_i16: +; GENERIC: # BB#0: +; 
GENERIC-NEXT: popcntw (%rsi), %cx +; GENERIC-NEXT: popcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i16: +; SLM: # BB#0: +; SLM-NEXT: popcntw (%rsi), %cx # sched: [6:1.00] +; SLM-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX %AX %EAX +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i16: +; SANDY: # BB#0: +; SANDY-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; SANDY-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX %AX %EAX +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntw (%rsi), %cx # sched: [8:1.00] +; BTVER2-NEXT: popcntw %di, %ax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00] +; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctpop.i16( i16 %1 ) + %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctpop.i16(i16) + +define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctpop_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntl (%rsi), %ecx +; GENERIC-NEXT: popcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; 
SLM-LABEL: test_ctpop_i32: +; SLM: # BB#0: +; SLM-NEXT: popcntl (%rsi), %ecx # sched: [6:1.00] +; SLM-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i32: +; SANDY: # BB#0: +; SANDY-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; SANDY-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00] +; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00] +; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctpop.i32( i32 %1 ) + %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctpop.i32(i32) + +define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctpop_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntq (%rsi), %rcx +; GENERIC-NEXT: popcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i64: +; SLM: # BB#0: +; SLM-NEXT: popcntq (%rsi), %rcx # sched: [6:1.00] +; SLM-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i64: +; SANDY: # BB#0: +; SANDY-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00] +; 
SANDY-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00] +; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00] +; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctpop.i64( i64 %1 ) + %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll new file mode 100644 index 000000000000..26c4bdb2375a --- /dev/null +++ b/test/CodeGen/X86/pr32282.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 + +; Check for assert in foldMaskAndShiftToScale due to out of range mask scaling. 
+ +@b = common global i8 zeroinitializer, align 1 +@c = common global i8 zeroinitializer, align 1 +@d = common global i64 zeroinitializer, align 8 +@e = common global i64 zeroinitializer, align 8 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: +; X86-NEXT: pushl %eax +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl d, %eax +; X86-NEXT: movl d+4, %ecx +; X86-NEXT: movl $701685459, %edx # imm = 0x29D2DED3 +; X86-NEXT: andnl %edx, %ecx, %ecx +; X86-NEXT: movl $-564453154, %edx # imm = 0xDE5B20DE +; X86-NEXT: andnl %edx, %eax, %edx +; X86-NEXT: shrdl $21, %ecx, %edx +; X86-NEXT: shrl $21, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: addl $7, %edx +; X86-NEXT: adcxl %eax, %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %edx +; X86-NEXT: .Lcfi2: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi3: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi4: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll __divdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: .Lcfi5: +; X86-NEXT: .cfi_adjust_cfa_offset -16 +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne {{[0-9]+}}(%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl +; +; X64-LABEL: foo: +; X64: # BB#0: +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000 +; X64-NEXT: andnq %rcx, %rax, %rcx +; X64-NEXT: shrq $21, %rcx +; X64-NEXT: addq $7, %rcx +; X64-NEXT: movabsq $4393751543808, %rax # imm = 0x3FF00000000 +; X64-NEXT: testq %rax, %rcx +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivq %rcx +; X64-NEXT: jmp .LBB0_3 +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl 
%edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: # kill: %EAX %EAX %RAX +; X64-NEXT: .LBB0_3: +; X64-NEXT: testq %rax, %rax +; X64-NEXT: setne -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + %1 = alloca i8, align 1 + %2 = load i64, i64* @d, align 8 + %3 = or i64 -3013716102214263007, %2 + %4 = xor i64 %3, -1 + %5 = load i64, i64* @e, align 8 + %6 = load i8, i8* @b, align 1 + %7 = trunc i8 %6 to i1 + %8 = zext i1 %7 to i64 + %9 = xor i64 %5, %8 + %10 = load i8, i8* @c, align 1 + %11 = trunc i8 %10 to i1 + %12 = zext i1 %11 to i32 + %13 = or i32 551409149, %12 + %14 = sub nsw i32 %13, 551409131 + %15 = zext i32 %14 to i64 + %16 = shl i64 %9, %15 + %17 = sub nsw i64 %16, 223084523 + %18 = ashr i64 %4, %17 + %19 = and i64 %18, 9223372036854775806 + %20 = add nsw i64 7, %19 + %21 = sdiv i64 0, %20 + %22 = icmp ne i64 %21, 0 + %23 = zext i1 %22 to i8 + store i8 %23, i8* %1, align 1 + ret void +} diff --git a/test/CodeGen/X86/pr32515.ll b/test/CodeGen/X86/pr32515.ll new file mode 100644 index 000000000000..aeb6803867aa --- /dev/null +++ b/test/CodeGen/X86/pr32515.ll @@ -0,0 +1,29 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s +; REQUIRES: asserts + +@var_26 = external global i16, align 2 + +define void @foo() #0 { + %1 = alloca i16, align 2 + %2 = load i16, i16* @var_26, align 2 + %3 = zext i16 %2 to i32 + %4 = icmp ne i32 %3, 7 + %5 = zext i1 %4 to i16 + store i16 %5, i16* %1, align 2 + %6 = load i16, i16* @var_26, align 2 + %7 = zext i16 %6 to i32 + %8 = and i32 1, %7 + %9 = shl i32 %8, 0 + %10 = load i16, i16* @var_26, align 2 + %11 = zext i16 %10 to i32 + %12 = icmp ne i32 %11, 7 + %13 = zext i1 %12 to i32 + %14 = and i32 %9, %13 + %15 = icmp ne i32 %14, 0 + %16 = zext i1 %15 to i8 + store i8 %16, i8* undef, align 1 + unreachable + } diff --git a/test/CodeGen/X86/pr33772.ll b/test/CodeGen/X86/pr33772.ll new 
file mode 100644 index 000000000000..ff22c7478866 --- /dev/null +++ b/test/CodeGen/X86/pr33772.ll @@ -0,0 +1,15 @@ +; RUN: not llc < %s -mcpu=skylake-avx512 2>&1 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; make sure we don't crash if scale for gather isn't constant. + +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.x86.avx512.gather.dpi.512 +declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32) + +define internal <16 x i32> @__gather_base_offsets32_i32(i8* readonly %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i8> %vecmask) { + %mask_vec_i1.i.i = icmp ne <16 x i8> %vecmask, zeroinitializer + %mask_i16.i = bitcast <16 x i1> %mask_vec_i1.i.i to i16 + %res = tail call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask_i16.i, i32 %offset_scale) + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/pr33828.ll b/test/CodeGen/X86/pr33828.ll new file mode 100644 index 000000000000..1b7f44323b61 --- /dev/null +++ b/test/CodeGen/X86/pr33828.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X64 + +@var_580 = external local_unnamed_addr global i8, align 1 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: # %entry +; X86-NEXT: movsbl var_580, %eax +; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # BB#2: # %if.end13 +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: # %if.then11 +; +; X64-LABEL: foo: +; X64: # BB#0: # %entry +; X64-NEXT: movsbl {{.*}}(%rip), %eax +; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # BB#2: # %if.end13 +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: # %if.then11 +entry: + %tmp = icmp ugt i8 undef, 60 + %phitmp = zext i1 
%tmp to i16 + br label %if.end + +if.end: + %tmp1 = load i8, i8* @var_580, align 1 + %conv7 = sext i8 %tmp1 to i32 + %conv8 = zext i16 %phitmp to i32 + %mul = shl nuw nsw i32 %conv8, 1 + %div9 = udiv i32 %mul, 71 + %sub = add nsw i32 %div9, -3 + %shl = shl i32 1, %sub + %neg = xor i32 %shl, -1 + %and = and i32 %neg, %conv7 + %tobool10 = icmp eq i32 %and, 0 + br i1 %tobool10, label %if.end13, label %if.then11 + +if.then11: + unreachable + +if.end13: + ret void +} diff --git a/test/CodeGen/X86/regparm.ll b/test/CodeGen/X86/regparm.ll index 9484e5a9490b..f427010edc51 100644 --- a/test/CodeGen/X86/regparm.ll +++ b/test/CodeGen/X86/regparm.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck -check-prefix=CHECK %s +; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck %s ; RUN: llc %s -mtriple=i386-pc-win32 -o - | FileCheck -check-prefix=WIN %s ; RUN: llc %s -mtriple=i386-pc-linux -fast-isel -o - | FileCheck -check-prefix=FAST %s ; RUN: llc %s -mtriple=i386-pc-win32 -fast-isel -o - | FileCheck -check-prefix=FASTWIN %s diff --git a/test/CodeGen/X86/rotate_vec.ll b/test/CodeGen/X86/rotate_vec.ll new file mode 100644 index 000000000000..8fb000bae827 --- /dev/null +++ b/test/CodeGen/X86/rotate_vec.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s + +define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) 
{ +; CHECK-LABEL: rot_v4i32_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll index 204e1f8b050b..b9d5a4813e09 100644 --- a/test/CodeGen/X86/sibcall-win64.ll +++ b/test/CodeGen/X86/sibcall-win64.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -declare x86_64_win64cc void @win64_callee(i32) -declare x86_64_win64cc void (i32)* @win64_indirect() -declare x86_64_win64cc void @win64_other(i32) +declare win64cc void @win64_callee(i32) +declare win64cc void (i32)* @win64_indirect() +declare win64cc void @win64_other(i32) declare void @sysv_callee(i32) declare void (i32)* @sysv_indirect() declare void @sysv_other(i32) define void @sysv_caller(i32 %p1) { entry: - tail call x86_64_win64cc void @win64_callee(i32 %p1) + tail call win64cc void @win64_callee(i32 %p1) ret void } @@ -19,7 +19,7 @@ entry: ; CHECK: addq $40, %rsp ; CHECK: retq -define x86_64_win64cc void @win64_caller(i32 %p1) { +define win64cc void @win64_caller(i32 %p1) { entry: tail call void @sysv_callee(i32 %p1) ret void @@ -37,18 +37,18 @@ define void @sysv_matched(i32 %p1) { ; CHECK-LABEL: sysv_matched: ; CHECK: jmp sysv_callee # TAILCALL -define x86_64_win64cc void @win64_matched(i32 %p1) { - tail call x86_64_win64cc void @win64_callee(i32 %p1) +define win64cc void 
@win64_matched(i32 %p1) { + tail call win64cc void @win64_callee(i32 %p1) ret void } ; CHECK-LABEL: win64_matched: ; CHECK: jmp win64_callee # TAILCALL -define x86_64_win64cc void @win64_indirect_caller(i32 %p1) { - %1 = call x86_64_win64cc void (i32)* @win64_indirect() - call x86_64_win64cc void @win64_other(i32 0) - tail call x86_64_win64cc void %1(i32 %p1) +define win64cc void @win64_indirect_caller(i32 %p1) { + %1 = call win64cc void (i32)* @win64_indirect() + call win64cc void @win64_other(i32 0) + tail call win64cc void %1(i32 %p1) ret void } diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index c41acd43b3ab..29f726c3df6a 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; GENERIC-LABEL: test_addps: @@ -45,6 +45,12 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] 
+; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fadd <4 x float> %1, %2 @@ -87,6 +93,12 @@ define float @test_addss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fadd float %1, %2 @@ -137,6 +149,12 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -191,6 +209,12 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -245,6 +269,13 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, 
%xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fcmp oeq <4 x float> %a0, %2 @@ -290,6 +321,12 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = insertelement <4 x float> undef, float %a1, i32 0 %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) @@ -385,6 +422,20 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) @@ -435,6 +486,13 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to float %2 = load i32, i32 *%a1, align 4 %3 = sitofp i32 %2 to float @@ -484,6 +542,13 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ssq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to float %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to float @@ -533,6 +598,13 @@ define i32 @test_cvtss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) %3 = load float, float 
*%a1, align 4 @@ -585,6 +657,13 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -637,6 +716,13 @@ define i32 @test_cvttss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i32 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i32 @@ -686,6 +772,13 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i64 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i64 @@ -729,6 +822,12 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: 
[24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fdiv <4 x float> %1, %2 @@ -771,6 +870,12 @@ define float @test_divss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fdiv float %1, %2 @@ -813,6 +918,12 @@ define void @test_ldmxcsr(i32 %a0) { ; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ldmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* store i32 %a0, i32* %1 @@ -857,6 +968,12 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x 
float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2) @@ -900,6 +1017,12 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2) @@ -943,6 +1066,12 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2) @@ -986,6 +1115,12 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> 
%a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2) @@ -1035,6 +1170,13 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 16 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 16 @@ -1079,6 +1221,11 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1129,6 +1276,13 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1177,6 +1331,12 @@ define <4 x float> @test_movlhps(<4 
x float> %a0, <4 x float> %a1) { ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = fadd <4 x float> %a1, %1 ret <4 x float> %2 @@ -1224,6 +1384,13 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1266,6 +1433,11 @@ define i32 @test_movmskps(<4 x float> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %1 } @@ -1307,6 +1479,11 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] store <4 x float> %a0, <4 x 
float> *%a1, align 16, !nontemporal !0 ret void } @@ -1353,6 +1530,13 @@ define void @test_movss_mem(float* %a0, float* %a1) { ; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float* %a0, align 1 %2 = fadd float %1, %1 store float %2, float *%a1, align 1 @@ -1395,6 +1579,11 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1441,6 +1630,13 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 1 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 1 @@ -1483,6 +1679,12 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fmul <4 x float> %1, %2 @@ -1525,6 +1727,12 @@ define float @test_mulss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fmul float %1, %2 @@ -1575,6 +1783,12 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -1621,6 +1835,11 @@ define void @test_prefetchnta(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_prefetchnta: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) ret void } @@ -1670,6 +1889,13 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # 
sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2) @@ -1728,6 +1954,14 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) { ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1782,6 +2016,13 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2) @@ -1840,6 +2081,14 @@ define 
<4 x float> @test_rsqrtss(float %a0, float *%a1) { ; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1886,6 +2135,11 @@ define void @test_sfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: sfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: sfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse.sfence() ret void } @@ -1931,6 +2185,12 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -1980,6 +2240,13 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] 
+; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2) @@ -2038,6 +2305,14 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2) @@ -2082,6 +2357,12 @@ define i32 @test_stmxcsr() { ; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_stmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -2126,6 +2407,12 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fsub <4 x float> %1, %2 @@ -2168,6 +2455,12 @@ define float @test_subss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fsub float %1, %2 @@ -2258,6 +2551,20 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2) @@ -2306,6 +2613,12 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2352,6 +2665,12 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2402,6 +2721,12 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git 
a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 3c36b2138139..6ee908e0c787 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %1, %2 @@ -87,6 +93,12 @@ define double @test_addsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # 
sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fadd double %1, %2 @@ -135,6 +147,13 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -188,6 +207,13 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -243,6 +269,13 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <2 
x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fcmp oeq <2 x double> %a0, %2 @@ -288,6 +321,12 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = insertelement <2 x double> undef, double %a1, i32 0 %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) @@ -383,6 +422,20 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) @@ -433,6 +486,13 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: 
retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sitofp <2 x i32> %1 to <2 x double> %3 = load <4 x i32>, <4 x i32>*%a1, align 16 @@ -485,6 +545,13 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x float> %2 = load <4 x i32>, <4 x i32>*%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x float> @@ -535,6 +602,13 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) @@ -586,6 +660,13 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq 
# sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) @@ -637,6 +718,13 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) @@ -688,6 +776,13 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> %2 = fpext <2 x float> %1 to <2 x double> %3 = load <4 x float>, <4 x float> *%a1, align 16 @@ -739,6 +834,13 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: 
[1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -791,6 +893,13 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -850,6 +959,14 @@ define float @test_cvtsd2ss(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc double %a0 to float %2 = load double, double *%a1, align 8 %3 = fptrunc double %2 to float @@ -899,6 +1016,13 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: 
[4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to double %2 = load i32, i32 *%a1, align 8 %3 = sitofp i32 %2 to double @@ -948,6 +1072,13 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to double %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to double @@ -1006,6 +1137,14 @@ define double @test_cvtss2sd(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fpext float %a0 to double %2 = load float, float *%a1, align 4 %3 = fpext float %2 to double @@ -1056,6 +1195,13 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttpd2dq: +; ZNVER1: # BB#0: 
+; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <2 x double> %a0 to <2 x i32> %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> %3 = load <2 x double>, <2 x double> *%a1, align 16 @@ -1108,6 +1254,13 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x float> %a0 to <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = fptosi <4 x float> %2 to <4 x i32> @@ -1157,6 +1310,13 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i32 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i32 @@ -1206,6 +1366,13 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00] +; 
ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i64 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i64 @@ -1249,6 +1416,12 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fdiv <2 x double> %1, %2 @@ -1291,6 +1464,12 @@ define double @test_divsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fdiv double %1, %2 @@ -1333,6 +1512,11 @@ define void @test_lfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: lfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.lfence() ret void } @@ -1374,6 +1558,11 @@ define void @test_mfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: mfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: mfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.mfence() ret void } @@ -1413,6 
+1602,11 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) ret void } @@ -1454,6 +1648,12 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) @@ -1497,6 +1697,12 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) @@ -1540,6 +1746,12 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: 
vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) @@ -1583,6 +1795,12 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) @@ -1632,6 +1850,13 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 16 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 16 @@ -1680,6 +1905,13 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: 
vmovdqa %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 16 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 16 @@ -1728,6 +1960,13 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 1 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 1 @@ -1794,6 +2033,16 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> undef, i32 %2, i32 0 @@ -1865,6 +2114,16 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 
# sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 %2 = load i64, i64 *%a2 %3 = insertelement <2 x i64> undef, i64 %2, i64 0 @@ -1918,6 +2177,13 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 1 @@ -1969,6 +2235,13 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 0 @@ 
-2010,6 +2283,11 @@ define i32 @test_movmskpd(<2 x double> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ret i32 %1 } @@ -2053,6 +2331,12 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a0 store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 ret void @@ -2094,6 +2378,12 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a0 store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 ret void @@ -2141,6 +2431,13 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = load i64, i64* %a1, align 1 %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 %3 = add <2 x i64> %a0, %2 @@ -2187,6 +2484,12 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { ; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> %2 = add <2 x i64> %a1, %1 ret <2 x i64> %2 @@ -2234,6 +2537,13 @@ define void @test_movsd_mem(double* %a0, double* %a1) { ; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double* %a0, align 1 %2 = fadd double %1, %1 store double %2, double *%a1, align 1 @@ -2277,6 +2587,11 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> ret <2 x double> %1 } @@ -2323,6 +2638,13 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # 
sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 1 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 1 @@ -2365,6 +2687,12 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fmul <2 x double> %1, %2 @@ -2407,6 +2735,12 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fmul double %1, %2 @@ -2455,6 +2789,13 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %xmm0, 
%xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -2510,6 +2851,12 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packssdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -2562,6 +2909,12 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packsswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2614,6 +2967,12 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packuswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; 
ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2662,6 +3021,12 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = add <16 x i8> %1, %2 @@ -2708,6 +3073,12 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = add <4 x i32> %1, %2 @@ -2750,6 +3121,12 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = add <2 x i64> %1, %2 @@ -2796,6 +3173,12 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, 
<16 x i8> *%a2) { ; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) @@ -2843,6 +3226,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) @@ -2890,6 +3279,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) @@ -2937,6 +3332,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, 
<8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) @@ -2984,6 +3385,12 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = add <8 x i16> %1, %2 @@ -3032,6 +3439,13 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pand: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = and <2 x i64> %1, %2 @@ -3087,6 +3501,13 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pandn: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %2 = and <2 x i64> %a1, %1 %3 = load <2 x i64>, <2 x i64> *%a2, align 16 @@ -3136,6 +3557,12 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) @@ -3183,6 +3610,12 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) @@ -3234,6 +3667,13 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: 
retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp eq <16 x i8> %a0, %2 @@ -3286,6 +3726,13 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3338,6 +3785,13 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp eq <8 x i16> %a0, %2 @@ -3391,6 +3845,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pcmpgtb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp sgt <16 x i8> %a0, %2 @@ -3444,6 +3905,13 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3497,6 +3965,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp sgt <8 x i16> %a0, %2 @@ -3541,6 +4016,12 @@ define i16 @test_pextrw(<8 x i16> %a0) { ; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: 
%AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 6 ret i16 %1 } @@ -3585,6 +4066,12 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 %2 = load i16, i16 *%a2 %3 = insertelement <8 x i16> %1, i16 %2, i32 3 @@ -3635,6 +4122,12 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <4 x i32> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -3683,6 +4176,12 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> 
%1, <8 x i16> %2) @@ -3730,6 +4229,12 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) @@ -3777,6 +4282,12 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) @@ -3824,6 +4335,12 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x 
i8> %1, <16 x i8> %2) @@ -3863,6 +4380,11 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovmskb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ret i32 %1 } @@ -3904,6 +4426,12 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) @@ -3947,6 +4475,12 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) @@ -3990,6 +4524,12 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmullw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = mul <8 x i16> %1, %2 @@ -4040,6 +4580,12 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuludq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -4090,6 +4636,13 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_por: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = or <2 x i64> %1, %2 @@ -4141,6 +4694,12 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; 
ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -4193,6 +4752,13 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> @@ -4244,6 +4810,13 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50] +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4295,6 +4868,13 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pshuflw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50] +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4344,6 +4924,13 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) @@ -4389,6 +4976,11 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4435,6 +5027,13 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllq: +; ZNVER1: # 
BB#0: +; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) @@ -4486,6 +5085,13 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) @@ -4537,6 +5143,13 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) @@ -4588,6 +5201,13 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: 
[6:1.00] ; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psraw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) @@ -4639,6 +5259,13 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) @@ -4684,6 +5311,11 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4730,6 +5362,13 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlq $2, 
%xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) @@ -4781,6 +5420,13 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) @@ -4830,6 +5476,12 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = sub <16 x i8> %1, %2 @@ -4876,6 +5528,12 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubd 
(%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = sub <4 x i32> %1, %2 @@ -4918,6 +5576,12 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = sub <2 x i64> %1, %2 @@ -4964,6 +5628,12 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) @@ -5011,6 +5681,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: 
[1:0.25] +; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) @@ -5058,6 +5734,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) @@ -5105,6 +5787,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) @@ -5152,6 +5840,12 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, 
%xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = sub <8 x i16> %1, %2 @@ -5198,6 +5892,12 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5248,6 +5948,13 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; 
ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5297,6 +6004,13 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5344,6 +6058,12 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5390,6 +6110,12 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5440,6 +6166,13 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5489,6 +6222,13 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5536,6 +6276,12 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5584,6 +6330,13 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pxor: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 
%3 = xor <2 x i64> %1, %2 @@ -5633,6 +6386,13 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -5683,6 +6443,13 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) @@ -5741,6 +6508,14 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 
# sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -5785,6 +6560,12 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fsub <2 x double> %1, %2 @@ -5827,6 +6608,12 @@ define double @test_subsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fsub double %1, %2 @@ -5917,6 +6704,20 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: 
[1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) @@ -5967,6 +6768,13 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -6022,6 +6830,13 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> @@ -6071,6 +6886,13 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, 
<2 x double> ; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll index ef1ddae4532d..ad38d1c6ff49 100644 --- a/test/CodeGen/X86/sse3-schedule.ll +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addsubpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), 
%xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) @@ -88,6 +94,12 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) @@ -131,6 +143,12 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2) @@ -174,6 +192,12 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; 
ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) @@ -217,6 +241,12 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) @@ -260,6 +290,12 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) @@ -299,6 +335,11 @@ define <16 x i8> @test_lddqu(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu 
(%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ret <16 x i8> %1 } @@ -347,6 +388,13 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,6 +445,13 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50] +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -447,6 +502,13 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = 
mem[0,0,2,2] sched: [8:0.50] +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index 1ab1598fcab7..26cca98816a3 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_blendpd: @@ -43,6 +43,13 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x 
double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %a1, %1 @@ -80,6 +87,12 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -122,6 +135,12 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) %2 = load <2 x double>, <2 x double> *%a3, align 16 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) @@ -165,6 +184,12 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %xmm2, 
%xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) %2 = load <4 x float>, <4 x float> *%a3 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) @@ -202,6 +227,12 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7) @@ -239,6 +270,12 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7) @@ -276,6 +313,12 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] ; BTVER2-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) %2 = load float, float *%a2 %3 = insertelement <4 x float> %1, float %2, i32 3 @@ -308,6 +351,11 @@ define <2 x i64> @test_movntdqa(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0) ret <2 x i64> %1 } @@ -343,6 +391,12 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00] ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mpsadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = bitcast <8 x i16> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -381,6 +435,12 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packusdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, 
%xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -425,6 +485,12 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendvb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) %2 = load <16 x i8>, <16 x i8> *%a3, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2) @@ -462,6 +528,12 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -498,6 +570,12 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # 
sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 @@ -536,6 +614,12 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) { ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <16 x i8> %a0, i32 3 %2 = extractelement <16 x i8> %a0, i32 1 store i8 %2, i8 *%a1 @@ -573,6 +657,12 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) { ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <4 x i32> %a0, i32 3 %2 = extractelement <4 x i32> %a0, i32 1 store i32 %2, i32 *%a1 @@ -609,6 +699,12 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { ; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <2 x i64> %a0, i32 1 %2 = extractelement <2 x i64> %a0, i32 1 store 
i64 %2, i64 *%a2 @@ -645,6 +741,12 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) { ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 3 %2 = extractelement <8 x i16> %a0, i32 1 store i16 %2, i16 *%a1 @@ -682,6 +784,12 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) { ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phminposuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a0, align 16 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1) %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2) @@ -719,6 +827,12 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 %2 = load i8, i8 *%a2 %3 = insertelement <16 x i8> %1, i8 %2, i32 3 @@ -755,6 +869,12 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # 
sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> %1, i32 %2, i32 3 @@ -796,6 +916,13 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1 %2 = load i64, i64 *%a3 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1 @@ -833,6 +960,12 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2) @@ -870,6 +1003,12 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsd: +; ZNVER1: 
# BB#0: +; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2) @@ -907,6 +1046,12 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2) @@ -944,6 +1089,12 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2) @@ -981,6 +1132,12 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsb: +; 
ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2) @@ -1018,6 +1175,12 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2) @@ -1055,6 +1218,12 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2) @@ -1092,6 +1261,12 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pminuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2) @@ -1135,6 +1310,13 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = sext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1179,6 +1361,13 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = sext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1223,6 +1412,13 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: 
vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = sext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1267,6 +1463,13 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1311,6 +1514,13 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = sext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1355,6 +1565,13 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50] 
+; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = sext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1399,6 +1616,13 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = zext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1443,6 +1667,13 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] 
+; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = zext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1487,6 +1718,13 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = zext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1531,6 +1769,13 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = zext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1575,6 +1820,13 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 
sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = zext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1619,6 +1871,13 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = zext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1657,6 +1916,12 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to 
<4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -1695,6 +1960,12 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = mul <4 x i32> %1, %2 @@ -1751,6 +2022,16 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ptest: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: setb %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) @@ -1795,6 +2076,13 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> 
@llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) @@ -1839,6 +2127,13 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) @@ -1884,6 +2179,13 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) %2 = load <2 x double>, <2 x double>* %a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) @@ -1929,6 +2231,13 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundss: +; ZNVER1: # 
BB#0: +; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll index 7ce9ffdbd0ea..adf857e12179 100644 --- a/test/CodeGen/X86/sse42-schedule.ll +++ b/test/CodeGen/X86/sse42-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; GENERIC-LABEL: crc32_32_8: @@ -43,6 +43,13 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call 
i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) @@ -85,6 +92,13 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) %2 = load i16, i16 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) @@ -127,6 +141,13 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) %2 = load i32, i32 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) @@ -169,6 +190,13 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) @@ -211,6 +239,13 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00] ; BTVER2-NEXT: 
movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) %2 = load i64, i64 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) @@ -283,6 +318,19 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX %ECX %RCX ; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX %ECX %RCX +; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -341,6 +389,16 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] ; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; 
ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -393,6 +451,15 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX %ECX %RCX ; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX %ECX %RCX +; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7) @@ -431,6 +498,12 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7) @@ -468,6 +541,12 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; 
BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll index 11afdb7989f1..9ad6b0dfd4d6 100644 --- a/test/CodeGen/X86/sse4a-schedule.ll +++ b/test/CodeGen/X86/sse4a-schedule.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1 define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; GENERIC-LABEL: test_extrq: @@ -11,8 +11,13 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; ; BTVER2-LABEL: test_extrq: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq %xmm1, %xmm0 +; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1) ret <2 x i64> %1 } @@ -26,8 +31,13 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) { ; ; BTVER2-LABEL: test_extrqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq $2, $3, %xmm0 +; 
BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2) ret <2 x i64> %1 } @@ -41,8 +51,13 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertq: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq %xmm1, %xmm0 +; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1) ret <2 x i64> %1 } @@ -56,8 +71,13 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 +; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6) ret <2 x i64> %1 } @@ -73,6 +93,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) ret void } @@ -88,6 +113,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; 
+; ZNVER1-LABEL: test_movntss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) ret void } diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll index f24969a30c33..24ace69ebb9e 100644 --- a/test/CodeGen/X86/ssse3-schedule.ll +++ b/test/CodeGen/X86/ssse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; GENERIC-LABEL: test_pabsb: @@ -52,6 +52,13 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) %2 = load <16 x i8>, <16 x i8> *%a1, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2) @@ -103,6 +110,13 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: 
[1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2) @@ -147,6 +161,11 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2) @@ -196,6 +215,12 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50] ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_palignr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25] +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> @@ -238,6 +263,12 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # 
sched: [1:0.50] ; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) @@ -289,6 +320,12 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -332,6 +369,12 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) @@ -375,6 +418,12 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; 
BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2) @@ -426,6 +475,12 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -469,6 +524,12 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2) @@ -512,6 +573,12 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> 
%a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = bitcast <8 x i16> %1 to <16 x i8> @@ -550,6 +617,11 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhrsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -593,6 +665,12 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2) @@ -644,6 +722,12 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 
# sched: [1:0.50] ; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2) @@ -695,6 +779,12 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2) @@ -746,6 +836,12 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2) diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll index 
29f8e3ed4f78..784b932addc8 100644 --- a/test/CodeGen/X86/statepoint-invoke.ll +++ b/test/CodeGen/X86/statepoint-invoke.ll @@ -95,8 +95,8 @@ left.relocs: right: ; CHECK-LABEL: %right - ; CHECK: movq ; CHECK: movq %rdx, (%rsp) + ; CHECK: movq ; CHECK: callq some_call %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3) to label %right.relocs unwind label %exceptional_return.right diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll index b16426eae3d5..6e7fc7bf1c07 100644 --- a/test/CodeGen/X86/statepoint-stack-usage.ll +++ b/test/CodeGen/X86/statepoint-stack-usage.ll @@ -11,9 +11,9 @@ target triple = "x86_64-pc-linux-gnu" define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_calls ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) ; There should be no more than three moves ; CHECK-NOT: movq %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) @@ -36,9 +36,9 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: reserve_first ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12) %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13) @@ -61,21 +61,21 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_deopt ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: 
movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) @@ -89,9 +89,9 @@ define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 ; CHECK-LABEL: back_to_back_invokes entry: ; The exact stores don't matter, but there need to be three stack slots created - ; CHECK: movq %rdi, 16(%rsp) - ; CHECK: movq %rdx, 8(%rsp) - ; CHECK: movq %rsi, (%rsp) + ; CHECK-DAG: movq %rdi, 16(%rsp) + ; CHECK-DAG: movq %rdx, 8(%rsp) + ; CHECK-DAG: movq %rsi, (%rsp) ; CHECK: callq %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) to label %normal_return unwind label %exceptional_return diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll index 5bc8f983ff06..538d17564957 100644 --- a/test/CodeGen/X86/statepoint-vector.ll +++ b/test/CodeGen/X86/statepoint-vector.ll @@ -49,8 +49,8 @@ entry: ; CHECK: subq $40, %rsp ; CHECK: testb $1, %dil ; CHECK: movaps (%rsi), %xmm0 -; CHECK: movaps %xmm0, 16(%rsp) -; CHECK: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, 16(%rsp) ; CHECK: callq do_safepoint ; CHECK: movaps (%rsp), %xmm0 ; CHECK: addq $40, %rsp diff --git a/test/CodeGen/X86/vec_cmp_uint-128.ll b/test/CodeGen/X86/vec_cmp_uint-128.ll index 8bed14e7e5f5..cad7991c4f3b 100644 --- a/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -463,7 +463,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: gt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -476,7 +476,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: gt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -782,7 +782,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: lt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 @@ -795,7 +795,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: lt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 2b5eb695f53e..87cf2026d1ef 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -135,7 +135,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -433,7 +433,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -444,7 +444,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, 
%xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll index e7bfe3778212..ce0ec6c3875a 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -115,7 +115,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -381,7 +381,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index cd17fcf8c85b..8138442b3eaf 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -130,7 +130,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; 
AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -412,7 +412,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -423,7 +423,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll index 4adc2e2fb6c9..b0433110f181 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -123,7 +123,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -403,7 +403,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index 6719a66f030f..c65c3e7fd004 100644 --- a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -73,7 +73,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: PR20355: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 852c1f4d3d98..04378ee2ee01 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -77,14 +77,19 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; AVX512-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1 %XMM1 
%ZMM1 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v2i64: ; XOP: # BB#0: @@ -207,21 +212,26 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX512-NEXT: vpsubd %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v4i32: ; XOP: # BB#0: @@ -844,28 +854,24 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v2i64: +; AVX512BW: # BB#0: +; 
AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v2i64: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_rotate_v2i64: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512VL-LABEL: constant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; XOP-LABEL: constant_rotate_v2i64: +; XOP: # BB#0: +; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v2i64: ; X32-SSE: # BB#0: @@ -951,26 +957,24 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v4i32: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq 
+; AVX512VL-LABEL: constant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; -; XOPAVX2-LABEL: constant_rotate_v4i32: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: constant_rotate_v4i32: +; XOP: # BB#0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v4i32: ; X32-SSE: # BB#0: @@ -1100,11 +1104,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v8i16: @@ -1281,11 +1281,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v16i8: @@ -1371,12 +1367,18 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlq $50, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # 
kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v2i64: ; XOP: # BB#0: @@ -1412,12 +1414,18 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v4i32: ; XOP: # BB#0: @@ -1544,11 +1552,19 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # BB#0: @@ -1595,14 +1611,19 @@ define <4 x i32> 
@splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v4i32: ; XOP: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 14215e486bf9..3b65b68352b5 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -41,21 +41,25 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; 
AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -128,21 +132,25 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -466,7 +474,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 @@ -483,36 +491,36 @@ 
define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; 
XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, - %lshr = lshr <4 x i64> %a, + %lshr = lshr <4 x i64> %a, %or = or <4 x i64> %shl, %lshr ret <4 x i64> %or } @@ -549,30 +557,33 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; 
XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, %lshr = lshr <8 x i32> %a, @@ -643,30 +654,18 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, %lshr = lshr <16 x i16> %a, @@ -768,32 +767,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, @@ -825,12 +812,17 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq 
$50, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -873,12 +865,17 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -1027,11 +1024,18 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0 +; 
AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # BB#0: @@ -1082,14 +1086,18 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX1: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll new file mode 100644 index 000000000000..fa1b5c1c0cb4 --- /dev/null +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -0,0 +1,831 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VLBW + +; +; Variable Rotates +; + +define <8 x i64> 
@var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: var_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b64 = sub <8 x i64> , %b + %shl = shl <8 x i64> %a, %b + %lshr = lshr <8 x i64> %a, %b64 + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: var_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b32 = sub <16 x i32> , %b + %shl = shl <16 x i32> %a, %b + %lshr = lshr <16 x i32> %a, %b32 + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512F-LABEL: var_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw 
%zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %b16 = sub <32 x i16> , %b + %shl = shl <32 x i16> %a, %b + %lshr = lshr <32 x i16> %a, %b16 + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512F-LABEL: var_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, 
%ymm7, %ymm6, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} 
ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512VL-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 +; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm9, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm1, 
%k2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: 
vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLBW-NEXT: retq + %b8 = sub <64 x i8> , %b + %shl = shl <64 x i8> %a, %b + %lshr = lshr <64 x i8> %a, %b8 + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Constant Rotates +; + +define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: constant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: constant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + 
%shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, 
%ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; 
AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, 
%ymm4 +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: 
constant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_v8i64(<8 x 
i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { +; 
AVX512F-LABEL: splatconstant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; 
AVX512VLBW-LABEL: splatconstant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Masked Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %rmask = and <8 x i64> %lshr, + %lmask = and <8 x i64> %shl, + %or = or <8 x i64> %lmask, %rmask + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %rmask = and <16 x i32> %lshr, + %lmask = and <16 x i32> %shl, + %or = or <16 x i32> %lmask, %rmask + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand %ymm4, %ymm3, 
%ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %rmask = and <32 x i16> %lshr, + %lmask = and <32 x i16> %shl, + %or = or <32 x i16> %lmask, %rmask + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, 
%ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; 
AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %rmask = and <64 x i8> %lshr, + %lmask = and <64 x i8> %shl, + %or = or <64 x i8> %lmask, %rmask + ret <64 x i8> %or +} diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 09e143ddcd4d..5f2b18fc9c03 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: var_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -667,7 +667,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq 
{{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -687,7 +687,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1700,7 +1700,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-LABEL: splatconstant_shift_v4i64: ; XOPAVX2: # BB#0: ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057594037927936,72057594037927936,72057594037927936,72057594037927936] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 820178d2d992..5f00e55e225b 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -745,7 +745,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -755,7 +755,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd 
%zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 30e5661d5485..4a7d25c1376e 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -179,7 +179,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -189,7 +189,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -432,7 +432,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -442,7 +442,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; 
AVX512CD-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 3bf677aadf19..2fce8a601931 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -89,7 +89,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -99,7 +99,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -235,7 +235,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -245,7 +245,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll index 
5503cfc357e5..5825a56b6f99 100644 --- a/test/CodeGen/X86/vselect-avx.ll +++ b/test/CodeGen/X86/vselect-avx.ll @@ -58,8 +58,8 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: movq (%rdi,%rsi,8), %rax -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5] ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -108,7 +108,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 @@ -117,7 +117,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] ; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3] ; AVX2-NEXT: vpmulld %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll index 48753ad4fd76..5731b63f3bc1 100644 --- a/test/CodeGen/X86/widen_arith-2.ll +++ b/test/CodeGen/X86/widen_arith-2.ll @@ -16,20 +16,17 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; CHECK-NEXT: .LBB0_2: # %forbody ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl 
{{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %ecx +; CHECK-NEXT: leal (,%eax,8), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: psubw %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pshufb %xmm2, %xmm3 -; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8) +; CHECK-NEXT: movq %xmm3, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll index e55d62a461aa..cc6fb27a6293 100644 --- a/test/CodeGen/X86/widen_cast-4.ll +++ b/test/CodeGen/X86/widen_cast-4.ll @@ -16,22 +16,19 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; NARROW-NEXT: .LBB0_2: # %forbody ; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1 ; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %ecx +; NARROW-NEXT: leal (,%eax,8), %ecx ; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx +; NARROW-NEXT: addl %ecx, %edx +; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx +; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; 
NARROW-NEXT: psubw %xmm0, %xmm2 ; NARROW-NEXT: psllw $8, %xmm2 ; NARROW-NEXT: psraw $8, %xmm2 ; NARROW-NEXT: psraw $2, %xmm2 ; NARROW-NEXT: pshufb %xmm1, %xmm2 -; NARROW-NEXT: movq %xmm2, (%edx,%ecx,8) +; NARROW-NEXT: movq %xmm2, (%edx,%eax,8) ; NARROW-NEXT: incl (%esp) ; NARROW-NEXT: .LBB0_1: # %forcond ; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 @@ -54,24 +51,21 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; WIDE-NEXT: .LBB0_2: # %forbody ; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1 ; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %ecx +; WIDE-NEXT: leal (,%eax,8), %ecx ; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: addl %ecx, %edx +; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3 +; WIDE-NEXT: pinsrd $1, 4(%ecx,%eax,8), %xmm3 ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, %xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%ecx,8) -; WIDE-NEXT: movd %xmm3, (%edx,%ecx,8) +; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%eax,8) +; WIDE-NEXT: movd %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: .LBB0_1: # %forcond ; WIDE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/win64-nosse-csrs.ll b/test/CodeGen/X86/win64-nosse-csrs.ll index d1860b721044..29d4f165392e 100644 --- a/test/CodeGen/X86/win64-nosse-csrs.ll +++ b/test/CodeGen/X86/win64-nosse-csrs.ll @@ -20,7 +20,7 @@ entry-block: } ; Function Attrs: nounwind uwtable -define x86_64_win64cc i64 @peach() unnamed_addr #1 { 
+define win64cc i64 @peach() unnamed_addr #1 { entry-block: %0 = call i64 @banana() ret i64 %0 diff --git a/test/CodeGen/X86/win64_nonvol.ll b/test/CodeGen/X86/win64_nonvol.ll index 8e5f6cec1ab7..e1c615d75f28 100644 --- a/test/CodeGen/X86/win64_nonvol.ll +++ b/test/CodeGen/X86/win64_nonvol.ll @@ -5,7 +5,7 @@ ; Win64 nonvolatile registers get saved. ; CHECK-LABEL: bar: -define x86_64_win64cc void @bar(i32 %a, i32 %b) { +define win64cc void @bar(i32 %a, i32 %b) { ; CHECK-DAG: pushq %rdi ; CHECK-DAG: pushq %rsi ; CHECK-DAG: movaps %xmm6, diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll index a0b552d4d584..6b4273512013 100644 --- a/test/CodeGen/X86/win64_params.ll +++ b/test/CodeGen/X86/win64_params.ll @@ -12,7 +12,7 @@ entry: ret i32 %add } -define x86_64_win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { +define win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { entry: ; CHECK: movl 48(%rsp), %eax ; CHECK: addl 40(%rsp), %eax diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll index 0faa24ef7290..c7550a467a35 100644 --- a/test/CodeGen/X86/win_chkstk.ll +++ b/test/CodeGen/X86/win_chkstk.ll @@ -51,7 +51,7 @@ entry: ; Make sure we don't call __chkstk or __alloca on non-Windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X32: calll __chkstk ; WIN_X64: callq __chkstk diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll index c9a5fc2b3288..b4b8010ec564 100644 --- a/test/CodeGen/X86/win_coreclr_chkstk.ll +++ b/test/CodeGen/X86/win_coreclr_chkstk.ll @@ -103,7 +103,7 @@ entry: ; Make sure we don't emit the probe sequence if not on windows even if the ; caller has the Win64 calling convention. 
-define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx @@ -115,7 +115,7 @@ entry: declare i32 @bar(i8*) nounwind ; Within-body inline probe expansion -define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind { +define win64cc i32 @main4k_alloca(i64 %n) nounwind { entry: ; WIN_X64: callq bar ; WIN_X64: movq %gs:16, [[R:%r.*]] diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index 299190e8a595..e3387a2709cb 100644 --- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -3,7 +3,7 @@ ; Verify that the var arg parameters which are passed in registers are stored ; in home stack slots allocated by the caller and that AP is correctly ; calculated. -define x86_64_win64cc void @average_va(i32 %count, ...) nounwind { +define win64cc void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq ; CHECK: movq %r9, 40(%rsp) @@ -24,7 +24,7 @@ declare void @llvm.va_end(i8*) nounwind ; CHECK-LABEL: f5: ; CHECK: pushq ; CHECK: leaq 56(%rsp), -define x86_64_win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { +define win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -35,7 +35,7 @@ entry: ; CHECK-LABEL: f4: ; CHECK: pushq ; CHECK: leaq 48(%rsp), -define x86_64_win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -46,7 +46,7 @@ entry: ; CHECK-LABEL: f3: ; CHECK: pushq ; CHECK: leaq 40(%rsp), -define x86_64_win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) nounwind { +define win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) 
nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -62,7 +62,7 @@ entry: ; CHECK: movq [[REG_copy1]], 8(%rsp) ; CHECK: movq [[REG_copy1]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy1(i64 %a0, ...) nounwind { +define win64cc void @copy1(i64 %a0, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -78,7 +78,7 @@ entry: ; CHECK: movq [[REG_copy4]], 8(%rsp) ; CHECK: movq [[REG_copy4]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -96,7 +96,7 @@ entry: ; CHECK: movq [[REG_arg4_2]], (%rsp) ; CHECK: movl 48(%rsp), %eax ; CHECK: ret -define x86_64_win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll new file mode 100644 index 000000000000..39877c14429f --- /dev/null +++ b/test/CodeGen/X86/x86-cmov-converter.ll @@ -0,0 +1,321 @@ +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; This test checks that x86-cmov-converter optimization transform CMOV +;; instruction into branches when it is profitable. +;; There are 5 cases below: +;; 1. CmovInCriticalPath: +;; CMOV depends on the condition and it is in the hot path. +;; Thus, it worths transforming. +;; +;; 2. CmovNotInCriticalPath: +;; similar test like in (1), just that CMOV is not in the hot path. +;; Thus, it does not worth transforming. +;; +;; 3. 
MaxIndex: +;; Maximum calculation algorithm that is looking for the max index, +;; calculating CMOV value is cheaper than calculating CMOV condition. +;; Thus, it worths transforming. +;; +;; 4. MaxValue: +;; Maximum calculation algorithm that is looking for the max value, +;; calculating CMOV value is not cheaper than calculating CMOV condition. +;; Thus, it does not worth transforming. +;; +;; 5. BinarySearch: +;; Usually, binary search CMOV is not predicted. +;; Thus, it does not worth transforming. +;; +;; Test was created using the following command line: +;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - +;; Where foo.c is: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; } +;;} +;; +;; +;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; d[i] /= b; +;; } +;;} +;; +;; +;;int MaxIndex(int n, int *a) { +;; int t = 0; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > a[t]) +;; t = i; +;; } +;; return a[t]; +;;} +;; +;; +;;int MaxValue(int n, int *a) { +;; int t = a[0]; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > t) +;; t = a[i]; +;; } +;; return t; +;;} +;; +;;typedef struct Node Node; +;;struct Node { +;; unsigned Val; +;; Node *Right; +;; Node *Left; +;;}; +;; +;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) { +;; while (Curr->Val > Next->Val) { +;; Curr = Next; +;; if (Mask & (0x1 << Curr->Val)) +;; Next = Curr->Right; +;; else +;; Next = Curr->Left; +;; } +;; return Curr->Val; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%struct.Node = type { i32, %struct.Node*, %struct.Node* } + +; CHECK-LABEL: CmovInHotPath +; CHECK-NOT: cmov +; CHECK: jg + +define void 
@CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 0 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. = select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: CmovNotInHotPath +; CHECK: cmovg + +define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 { +entry: + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. 
= select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv + %1 = load i32, i32* %arrayidx7, align 4 + %div = sdiv i32 %1, %b + store i32 %div, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: MaxIndex +; CHECK-NOT: cmov +; CHECK: jg + +define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = sext i32 %i.0.t.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %t.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %t.0.lcssa + %0 = load i32, i32* %arrayidx5, align 4 + ret i32 %0 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %t.015 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1 + %2 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %1, %2 + %3 = trunc i64 %indvars.iv to i32 + %i.0.t.0 = select i1 %cmp3, i32 %3, i32 %t.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: MaxValue +; CHECK-NOT: jg +; CHECK: cmovg + +define i32 @MaxValue(i32 %n, i32* 
nocapture readonly %a) #0 { +entry: + %0 = load i32, i32* %a, align 4 + %cmp13 = icmp sgt i32 %n, 1 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ] + ret i32 %t.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ] + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx1, align 4 + %cmp2 = icmp sgt i32 %1, %t.014 + %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: BinarySearch +; CHECK: cmov + +define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 { +entry: + %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0 + %0 = load i32, i32* %Val8, align 8 + %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0 + %1 = load i32, i32* %Val19, align 8 + %cmp10 = icmp ugt i32 %0, %1 + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %2 = phi i32 [ %4, %while.body ], [ %1, %entry ] + %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ] + %shl = shl i32 1, %2 + %and = and i32 %shl, %Mask + %tobool = icmp eq i32 %and, 0 + %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2 + %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1 + %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right + %3 = load 
%struct.Node*, %struct.Node** %Left.sink, align 8 + %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0 + %4 = load i32, i32* %Val1, align 8 + %cmp = icmp ugt i32 %2, %4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ] + ret i32 %.lcssa +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following test checks that x86-cmov-converter optimization transforms +;; CMOV instructions into branch correctly. +;; +;; MBB: +;; cond = cmp ... +;; v1 = CMOVgt t1, f1, cond +;; v2 = CMOVle s1, f2, cond +;; +;; Where: t1 = 11, f1 = 22, f2 = a +;; +;; After CMOV transformation +;; ------------------------- +;; MBB: +;; cond = cmp ... +;; ja %SinkMBB +;; +;; FalseMBB: +;; jmp %SinkMBB +;; +;; SinkMBB: +;; %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] +;; %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch +;; ; true-value with false-value +;; ; Phi instruction cannot use +;; ; previous Phi instruction result +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: Transform +; CHECK-NOT: cmov +; CHECK: divl [[a:%[0-9a-z]*]] +; CHECK: cmpl [[a]], %eax +; CHECK: movl $11, [[s1:%[0-9a-z]*]] +; CHECK: movl [[a]], [[s2:%[0-9a-z]*]] +; CHECK: ja [[SinkBB:.*]] +; CHECK: [[FalseBB:.*]]: +; CHECK: movl $22, [[s1]] +; CHECK: movl $22, [[s2]] +; CHECK: [[SinkBB]]: +; CHECK: ja + +define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 { +entry: + %cmp10 = icmp ugt i32 0, %n + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ] + %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i + %x = load i32, i32* %arr_i, align 4 + %div = udiv i32 %x, %a + %cond = icmp ugt i32 %div, %a + %condOpp = icmp ule i32 %div, %a + %s1 = select i1 %cond, i32 11, i32 22 + %s2 = 
select i1 %condOpp, i32 %s1, i32 %a + %sum = urem i32 %s1, %s2 + store i32 %sum, i32* %arr_i, align 4 + %i_inc = add i32 %i, 1 + %cmp = icmp ugt i32 %i_inc, %n + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +attributes #0 = {"target-cpu"="x86-64"} diff --git a/test/CodeGen/XCore/varargs.ll b/test/CodeGen/XCore/varargs.ll index 2e364b275610..b6f716d66c9d 100644 --- a/test/CodeGen/XCore/varargs.ll +++ b/test/CodeGen/XCore/varargs.ll @@ -26,10 +26,10 @@ entry: ; CHECK-LABEL: test_vararg ; CHECK: extsp 6 ; CHECK: stw lr, sp[1] -; CHECK: stw r3, sp[6] -; CHECK: stw r0, sp[3] -; CHECK: stw r1, sp[4] -; CHECK: stw r2, sp[5] +; CHECK-DAG: stw r3, sp[6] +; CHECK-DAG: stw r0, sp[3] +; CHECK-DAG: stw r1, sp[4] +; CHECK-DAG: stw r2, sp[5] ; CHECK: ldaw r0, sp[3] ; CHECK: stw r0, sp[2] %list = alloca i8*, align 4 diff --git a/test/DebugInfo/Generic/namespace.ll b/test/DebugInfo/Generic/namespace.ll index 983e20db4111..5a8f65263192 100644 --- a/test/DebugInfo/Generic/namespace.ll +++ b/test/DebugInfo/Generic/namespace.ll @@ -54,17 +54,14 @@ ; CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; This is a bug, it should be in F2 but it inherits the file from its -; enclosing scope -; CHECK-NEXT: DW_AT_decl_file{{.*}}stdin +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(15) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS2]]}) ; CHECK: NULL ; CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; Same bug as above, this should be F2 -; CHECK-NEXT: DW_AT_decl_file{{.*}}debug-info-namespace.cpp +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(18) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]}) ; CHECK-NOT: NULL @@ -320,29 +317,29 @@ attributes #1 = { nounwind readnone } !31 = !DIGlobalVariable(name: "i", linkageName: "_ZN1A1B1iE", line: 20, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !32 = !DIGlobalVariable(name: "var_fwd", linkageName: 
"_ZN1A1B7var_fwdE", line: 44, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !33 = !{!34, !35, !36, !37, !40, !41, !42, !43, !44, !45, !47, !48, !49, !51, !54, !55, !56} -!34 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 15, scope: !7, entity: !6) -!35 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 18, scope: !0, entity: !7) -!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, name: "E", scope: !0, entity: !7) -!37 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 23, scope: !38, entity: !6) +!34 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 15, scope: !7, entity: !6) +!35 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 18, scope: !0, entity: !7) +!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 19, name: "E", scope: !0, entity: !7) +!37 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 23, scope: !38, entity: !6) !38 = distinct !DILexicalBlock(line: 22, column: 10, file: !5, scope: !39) !39 = distinct !DILexicalBlock(line: 22, column: 7, file: !5, scope: !21) -!40 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 26, scope: !21, entity: !7) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 27, scope: !21, entity: !4) -!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 28, scope: !21, entity: !8) -!43 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 29, scope: !21, entity: !14) -!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 30, scope: !21, entity: !31) -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 31, scope: !21, entity: !46) +!40 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 26, scope: !21, entity: !7) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 27, scope: !21, entity: !4) +!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 28, scope: !21, entity: !8) +!43 = 
!DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 29, scope: !21, entity: !14) +!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 30, scope: !21, entity: !31) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 31, scope: !21, entity: !46) !46 = !DIDerivedType(tag: DW_TAG_typedef, name: "baz", line: 7, file: !5, scope: !6, baseType: !8) -!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 32, name: "X", scope: !21, entity: !7) -!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 33, name: "Y", scope: !21, entity: !47) -!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 34, scope: !21, entity: !50) +!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 32, name: "X", scope: !21, entity: !7) +!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 33, name: "Y", scope: !21, entity: !47) +!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 34, scope: !21, entity: !50) !50 = !DIGlobalVariable(name: "var_decl", linkageName: "_ZN1A1B8var_declE", line: 8, isLocal: false, isDefinition: false, scope: !6, file: !18, type: !13) -!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 35, scope: !21, entity: !52) +!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 35, scope: !21, entity: !52) !52 = !DISubprogram(name: "func_decl", linkageName: "_ZN1A1B9func_declEv", line: 9, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, file: !5, scope: !6, type: !19, variables: !53) !53 = !{} ; previously: invalid DW_TAG_base_type -!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 36, scope: !21, entity: !32) -!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 37, scope: !21, entity: !26) -!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !7, entity: !31) +!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, 
file: !5, line: 36, scope: !21, entity: !32) +!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 37, scope: !21, entity: !26) +!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 42, scope: !7, entity: !31) !57 = !{i32 2, !"Dwarf Version", i32 2} !58 = !{i32 2, !"Debug Info Version", i32 3} !59 = !{!"clang version 3.6.0 "} diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test index 1887af2e8268..14fe4bb352f6 100644 --- a/test/DebugInfo/PDB/pdbdump-headers.test +++ b/test/DebugInfo/PDB/pdbdump-headers.test @@ -91,189 +91,189 @@ ALL-NEXT: Mod 0001 | `* Linker *`: ALL: Types (TPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 75 records -ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 205956] -ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 163561] +ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 0x32484] +ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 0x27EE9] ALL-NEXT: return type = 0x0074 (int), # args = 0, param list = 0x1000 ALL-NEXT: calling conv = cdecl, options = None -ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 59811] +ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 0xE9A3] ALL-NEXT: - LF_ENUMERATE [apartment = 1] ALL-NEXT: - LF_ENUMERATE [single = 2] ALL-NEXT: - LF_ENUMERATE [free = 3] ALL-NEXT: - LF_ENUMERATE [neutral = 4] ALL-NEXT: - LF_ENUMERATE [both = 5] -ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] `__vc_attributes::threadingAttribute::threading_e` +ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 0x32D6F] `__vc_attributes::threadingAttribute::threading_e` ALL-NEXT: unique name: `.?AW4threading_e@threadingAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1002, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 0x3FF9] 
`__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 247078] +ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 0x3C526] ALL-NEXT: referent = 0x1004, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 194342] +ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 0x2F726] ALL-NEXT: 0x1003: `__vc_attributes::threadingAttribute::threading_e` -ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 254156] +ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 0x3E0CC] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1006 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 194536] +ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 0x2F7E8] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 167492] +ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 0x28E44] ALL-NEXT: - Method [type = 0x1007, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1008, vftable offset = -1, attrs = public] -ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 185421] +ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 0x2D44D] ALL-NEXT: - LF_NESTTYPE [name = `threading_e`, parent = 0x1003] ALL-NEXT: - LF_METHOD [name = `threadingAttribute`, # overloads = 2, overload list = 0x1009] ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x1003, offset = 0, attrs = public] -ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 0x1D2F4] 
`__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x100A ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 261871] +ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 0x3FEEF] ALL-NEXT: - LF_ENUMERATE [native = 0] ALL-NEXT: - LF_ENUMERATE [com = 1] ALL-NEXT: - LF_ENUMERATE [managed = 2] -ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] `__vc_attributes::event_receiverAttribute::type_e` +ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 0x305E7] `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_receiverAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 0xBBB8] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 251486] +ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 0x3D65E] ALL-NEXT: referent = 0x100E, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 134580] +ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 0x20DB4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: 0x0030 (bool): `bool` -ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 148190] +ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 0x242DE] ALL-NEXT: return type = 0x0003 (void), # args = 2, param list = 0x1010 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1012 | LF_ARGLIST [size 
= 12, hash = 113636] +ALL-NEXT: 0x1012 | LF_ARGLIST [size = 12, hash = 0x1BBE4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` -ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 53336] +ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 0xD058] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1012 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 55779] +ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 0xD9E3] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 220695] +ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 0x35E17] ALL-NEXT: - Method [type = 0x1011, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1013, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1014, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 198114] +ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 0x305E2] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x100D] ALL-NEXT: - LF_METHOD [name = `event_receiverAttribute`, # overloads = 3, overload list = 0x1015] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x100D, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `layout_dependent`, Type = 0x0030 (bool), offset = 4, attrs = public] -ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 0x244FE] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1016 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 
0x1018 | LF_FIELDLIST [size = 48, hash = 81128] +ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 0x13CE8] ALL-NEXT: - LF_ENUMERATE [never = 0] ALL-NEXT: - LF_ENUMERATE [allowed = 1] ALL-NEXT: - LF_ENUMERATE [always = 2] -ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] `__vc_attributes::aggregatableAttribute::type_e` +ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 0xEAFE] `__vc_attributes::aggregatableAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@aggregatableAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1018, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 0x350A1] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 174209] +ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 0x2A881] ALL-NEXT: referent = 0x101A, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 159978] +ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 0x270EA] ALL-NEXT: 0x1019: `__vc_attributes::aggregatableAttribute::type_e` -ALL-NEXT: 0x101D | LF_MFUNCTION [size = 28, hash = 249504] +ALL-NEXT: 0x101D | LF_MFUNCTION [size = 28, hash = 0x3CEA0] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x101C ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 141941] +ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 0x22A75] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 
0x101F | LF_METHODLIST [size = 20, hash = 238785] +ALL-NEXT: 0x101F | LF_METHODLIST [size = 20, hash = 0x3A4C1] ALL-NEXT: - Method [type = 0x101D, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x101E, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 6214] +ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 0x1846] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1019] ALL-NEXT: - LF_METHOD [name = `aggregatableAttribute`, # overloads = 2, overload list = 0x101F] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1019, offset = 0, attrs = public] -ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 0x172D7] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1020 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] `__vc_attributes::event_sourceAttribute::type_e` +ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 0x24F99] `__vc_attributes::event_sourceAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 135589] +ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 0x211A5] ALL-NEXT: - LF_ENUMERATE [speed = 0] ALL-NEXT: - LF_ENUMERATE [size = 1] -ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] `__vc_attributes::event_sourceAttribute::optimize_e` +ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 0x11E9D] `__vc_attributes::event_sourceAttribute::optimize_e` ALL-NEXT: unique name: `.?AW4optimize_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1023, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is 
nested -ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 0x17900] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 254299] +ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 0x3E15B] ALL-NEXT: referent = 0x1025, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 17744] +ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 0x4550] ALL-NEXT: 0x1022: `__vc_attributes::event_sourceAttribute::type_e` -ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 239514] +ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 0x3A79A] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1027 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 173189] +ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 0x2A485] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 130544] +ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 0x1FDF0] ALL-NEXT: - Method [type = 0x1028, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1029, vftable offset = -1, attrs = public] -ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 204437] +ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 0x31E95] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1022] ALL-NEXT: - LF_NESTTYPE [name = `optimize_e`, parent = 0x1024] ALL-NEXT: - LF_METHOD [name = `event_sourceAttribute`, # overloads = 2, overload list = 0x102A] ALL-NEXT: - 
LF_MEMBER [name = `type`, Type = 0x1022, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `optimize`, Type = 0x1024, offset = 4, attrs = public] ALL-NEXT: - LF_MEMBER [name = `decorate`, Type = 0x0030 (bool), offset = 8, attrs = public] -ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 0x3A3E0] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x102B ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 144673] +ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 0x23521] ALL-NEXT: - LF_ENUMERATE [dll = 1] ALL-NEXT: - LF_ENUMERATE [exe = 2] ALL-NEXT: - LF_ENUMERATE [service = 3] ALL-NEXT: - LF_ENUMERATE [unspecified = 4] ALL-NEXT: - LF_ENUMERATE [EXE = 2] ALL-NEXT: - LF_ENUMERATE [SERVICE = 3] -ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] `__vc_attributes::moduleAttribute::type_e` +ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 0x1C1CF] `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@moduleAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x102D, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 0x302BA] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 256035] +ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 0x3E823] ALL-NEXT: referent = 0x102F, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1031 | LF_MODIFIER [size = 12, hash = 101096] +ALL-NEXT: 0x1031 | 
LF_MODIFIER [size = 12, hash = 0x18AE8] ALL-NEXT: referent = 0x0070 (char), modifiers = const -ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 231280] +ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 0x38770] ALL-NEXT: referent = 0x1031, mode = pointer, opts = None, kind = ptr32 -ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 52156] +ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 0xCBBC] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: 0x1032: `const char*` ALL-NEXT: 0x1032: `const char*` @@ -289,25 +289,25 @@ ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x1032: `const char*` ALL-NEXT: 0x1032: `const char*` -ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 48854] +ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 0xBED6] ALL-NEXT: return type = 0x0003 (void), # args = 15, param list = 0x1033 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 170035] +ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 0x29833] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` -ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 177041] +ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 0x2B391] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1035 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 102745] +ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 0x19159] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 16947] +ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 0x4233] ALL-NEXT: - Method [type = 0x1034, vftable offset = -1, attrs = public] ALL-NEXT: 
- Method [type = 0x1036, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1037, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 183703] +ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 0x2CD97] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x102E] ALL-NEXT: - LF_METHOD [name = `moduleAttribute`, # overloads = 3, overload list = 0x1038] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x102E, offset = 0, attrs = public] @@ -325,11 +325,11 @@ ALL-NEXT: - LF_MEMBER [name = `hidden`, Type = 0x0030 (bool), offset ALL-NEXT: - LF_MEMBER [name = `restricted`, Type = 0x0030 (bool), offset = 45, attrs = public] ALL-NEXT: - LF_MEMBER [name = `custom`, Type = 0x1032, offset = 48, attrs = public] ALL-NEXT: - LF_MEMBER [name = `resource_name`, Type = 0x1032, offset = 52, attrs = public] -ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 0x180F4] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1039 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 35693] +ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 0x8B6D] ALL-NEXT: - LF_ENUMERATE [eAnyUsage = 0] ALL-NEXT: - LF_ENUMERATE [eCoClassUsage = 1] ALL-NEXT: - LF_ENUMERATE [eCOMInterfaceUsage = 2] @@ -360,58 +360,58 @@ ALL-NEXT: - LF_ENUMERATE [eModuleUsage = 16777216] ALL-NEXT: - LF_ENUMERATE [eIllegalUsage = 33554432] ALL-NEXT: - LF_ENUMERATE [eAsynchronousUsage = 67108864] ALL-NEXT: - LF_ENUMERATE [eAnyIDLUsage = 4161535] -ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] `__vc_attributes::helper_attributes::usageAttribute::usage_e` +ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 0x29D40] `__vc_attributes::helper_attributes::usageAttribute::usage_e` ALL-NEXT: unique name: 
`.?AW4usage_e@usageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x103B, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 0x31B78] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 139292] +ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 0x2201C] ALL-NEXT: referent = 0x103D, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 49018] +ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 0xBF7A] ALL-NEXT: 0x0075 (unsigned): `unsigned` -ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 43821] +ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 0xAB2D] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x103F ALL-NEXT: class type = 0x103D, this type = 0x103E, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 202555] +ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 0x3173B] ALL-NEXT: - LF_NESTTYPE [name = `usage_e`, parent = 0x103C] ALL-NEXT: - LF_ONEMETHOD [name = `usageAttribute`] ALL-NEXT: type = 0x1040, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x0075 (unsigned), offset = 0, attrs = public] -ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 0x284B0] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1041 ALL-NEXT: options: 
has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 215835] +ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 0x34B1B] ALL-NEXT: - LF_ENUMERATE [eBoolean = 0] ALL-NEXT: - LF_ENUMERATE [eInteger = 1] ALL-NEXT: - LF_ENUMERATE [eFloat = 2] ALL-NEXT: - LF_ENUMERATE [eDouble = 3] -ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` +ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 0x22D21] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@v1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x1043, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 0xCD36] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 44186] +ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 0xAC9A] ALL-NEXT: referent = 0x1045, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 103930] +ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 0x195FA] ALL-NEXT: 0x1044: `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` -ALL-NEXT: 0x1048 | LF_MFUNCTION [size = 28, hash = 110942] +ALL-NEXT: 0x1048 | LF_MFUNCTION [size = 28, hash = 0x1B15E] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1047 ALL-NEXT: class type = 0x1045, this type = 0x1046, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1049 | LF_FIELDLIST [size = 64, hash = 17991] +ALL-NEXT: 0x1049 | LF_FIELDLIST [size 
= 64, hash = 0x4647] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1044] ALL-NEXT: - LF_ONEMETHOD [name = `v1_alttypeAttribute`] ALL-NEXT: type = 0x1048, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1044, offset = 0, attrs = public] -ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 0x340DF] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1049 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -421,29 +421,29 @@ ALL: Hash Adjusters: ALL: Types (IPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 15 records -ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7186] +ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C12] ALL-NEXT: udt = 0x100B, mod = 1, file = 1, line = 481 -ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7198] +ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C1E] ALL-NEXT: udt = 0x1017, mod = 1, file = 1, line = 194 -ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7180] +ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C0C] ALL-NEXT: udt = 0x1021, mod = 1, file = 1, line = 603 -ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7191] +ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C17] ALL-NEXT: udt = 0x102C, mod = 1, file = 1, line = 1200 -ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7201] +ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C21] ALL-NEXT: udt = 0x103A, mod = 1, file = 1, line = 540 -ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7241] +ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C49] ALL-NEXT: udt = 0x1042, mod = 1, file 
= 1, line = 108 -ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7249] +ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C51] ALL-NEXT: udt = 0x104A, mod = 1, file = 1, line = 96 -ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 80727] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs -ALL-NEXT: 0x1008 | LF_STRING_ID [size = 76, hash = 154177] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe -ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 75189] ID: , String: empty.cpp -ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 253662] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb -ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 193467] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows -ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 222705] +ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 0x13B57] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs +ALL-NEXT: 0x1008 | LF_STRING_ID [size = 76, hash = 0x25A41] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe +ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 0x125B5] ID: , String: empty.cpp +ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 0x3DEDE] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb +ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 0x2F3BB] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows +ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 0x365F1] ALL-NEXT: 0x100B: `-Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files 
(x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows` -ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 186099] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X -ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 257108] +ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 0x2D6F3] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X +ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 0x3EC54] ALL-NEXT: 0x1007: `d:\src\llvm\test\DebugInfo\PDB\Inputs` ALL-NEXT: 0x1008: `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe` ALL-NEXT: 0x1009: `empty.cpp` @@ -472,13 +472,13 @@ ALL-NEXT: frontend = 18.0.31101.0, backend = 18.0.31101.0 ALL-NEXT: flags = security checks ALL-NEXT: 120 | S_GPROC32 [size = 44] `main` ALL-NEXT: parent = 0, end = 196, addr = 0001:0016, code size = 10 -ALL-NEXT: debug start = 3, debug end = 8, flags = has fp +ALL-NEXT: type = `0x1001 (int ())`, debug start = 3, debug end = 8, flags = has fp ALL-NEXT: 164 | S_FRAMEPROC [size = 32] ALL-NEXT: size = 0, padding size = 0, offset to padding = 0 ALL-NEXT: bytes of callee saved registers = 0, exception handler addr = 0000:0000 ALL-NEXT: flags = has async eh | opt speed ALL-NEXT: 196 | S_END [size = 4] -ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `4110` +ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `0x100E` ALL-NEXT: Mod 0001 | `* Linker *`: ALL-NEXT: 4 | S_OBJNAME [size = 20] sig=0, `* Linker *` ALL-NEXT: 24 | S_COMPILE3 [size = 48] diff --git a/test/DebugInfo/X86/DIModule.ll b/test/DebugInfo/X86/DIModule.ll index 1fe7f0c5fabe..eacdfe10f53b 100644 --- a/test/DebugInfo/X86/DIModule.ll +++ b/test/DebugInfo/X86/DIModule.ll @@ -18,7 +18,7 @@ target triple = "x86_64-apple-macosx" !1 = !DIFile(filename: "/llvm/tools/clang/test/Modules/", directory: "/") !2 = !{} !3 = !{!4} -!4 = 
!DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, line: 5) +!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, file: !1, line: 5) !5 = !DIModule(scope: null, name: "DebugModule", configMacros: "-DMODULES=0", includePath: "/llvm/tools/clang/test/Modules/Inputs", isysroot: "/") !6 = !{i32 2, !"Dwarf Version", i32 4} !7 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/DebugInfo/X86/DIModuleContext.ll b/test/DebugInfo/X86/DIModuleContext.ll index a63fd0f373cd..02d4441c8234 100644 --- a/test/DebugInfo/X86/DIModuleContext.ll +++ b/test/DebugInfo/X86/DIModuleContext.ll @@ -24,7 +24,7 @@ target triple = "x86_64-apple-macosx" !4 = !{} !5 = !{!0} !6 = !{!7} -!7 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, line: 11) +!7 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, file: !3, line: 11) !8 = !DIModule(scope: null, name: "Module", includePath: ".", isysroot: "/") !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64) !10 = !DICompositeType(tag: DW_TAG_structure_type, name: "s", scope: !8, file: !3, line: 1, flags: DIFlagFwdDecl) diff --git a/test/DebugInfo/X86/fission-inline.ll b/test/DebugInfo/X86/fission-inline.ll index 45e0127294d1..eadcd15b2f21 100644 --- a/test/DebugInfo/X86/fission-inline.ll +++ b/test/DebugInfo/X86/fission-inline.ll @@ -110,7 +110,7 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !16 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !17 = !DISubprogram(name: "f2", linkageName: "_ZN3foo2f2IiEEvv", line: 10, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 10, file: !1, scope: !4, type: !12, templateParams: !14) !18 = !{!19} -!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, scope: !20, entity: !4) +!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: 
!1, line: 19, scope: !20, entity: !4) !20 = distinct !DILexicalBlock(line: 16, column: 13, file: !1, scope: !21) !21 = distinct !DILexicalBlock(line: 16, column: 7, file: !1, scope: !10) !22 = !{i32 2, !"Dwarf Version", i32 4} diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll index 533ab838a732..df1d113538ea 100644 --- a/test/DebugInfo/X86/gnu-public-names.ll +++ b/test/DebugInfo/X86/gnu-public-names.ll @@ -345,9 +345,9 @@ attributes #1 = { nounwind readnone } !42 = !DINamespace(scope: !43) !43 = !DINamespace(name: "outer", scope: null) !44 = !{!45, !47} -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, line: 34) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, file:!3, line: 34) !46 = !DIGlobalVariable(name: "global_namespace_variable_decl", linkageName: "_ZN2ns30global_namespace_variable_declE", scope: !18, file: !3, line: 28, type: !9, isLocal: false, isDefinition: false) -!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, line: 43) +!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, file: !3, line: 43) !48 = !{i32 2, !"Dwarf Version", i32 4} !49 = !{i32 2, !"Debug Info Version", i32 3} !50 = !{!"clang version 3.7.0 (trunk 234897) (llvm/trunk 234911)"} diff --git a/test/DebugInfo/X86/lexical-block-file-inline.ll b/test/DebugInfo/X86/lexical-block-file-inline.ll index 0f85a5573f03..9f040f41ec5e 100644 --- a/test/DebugInfo/X86/lexical-block-file-inline.ll +++ b/test/DebugInfo/X86/lexical-block-file-inline.ll @@ -134,7 +134,7 @@ attributes #2 = { nounwind } !8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !9, file: !9, line: 6, type: !5, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !DIFile(filename: "test.h", directory: "/") !10 = !{!11} -!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, 
entity: !14, line: 1) +!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !14, file: !1, line: 1) !12 = !DILexicalBlockFile(scope: !13, file: !9, discriminator: 0) !13 = distinct !DILexicalBlock(scope: !4, file: !1, line: 3) !14 = !DINamespace(name: "N", scope: null) diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll index a8278c9dcf83..c94572f7d23c 100644 --- a/test/DebugInfo/X86/pr19307.ll +++ b/test/DebugInfo/X86/pr19307.ll @@ -105,26 +105,26 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !19 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", line: 65, file: !20, scope: !10, baseType: !8) !20 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc") !21 = !{!22, !26, !29, !33, !38, !41} -!22 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 57, scope: !23, entity: !25) +!22 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !1, line: 57, scope: !23, entity: !25) !23 = !DINamespace(name: "__gnu_debug", scope: null) !24 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", directory: "/llvm_cmake_gcc") !25 = !DINamespace(name: "__debug", scope: !10) -!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 66, scope: !10, entity: !27) +!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 66, scope: !10, entity: !27) !27 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", line: 106, file: !5, baseType: !28) !28 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", line: 95, file: !5, baseType: !4) -!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 141, scope: !10, entity: !30) +!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 141, scope: !10, entity: !30) !30 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", line: 141, file: !31, baseType: !32) !31 = !DIFile(filename: 
"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc") !32 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned) -!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !34, entity: !36) +!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 42, scope: !34, entity: !36) !34 = !DINamespace(name: "__gnu_cxx", scope: null) !35 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", directory: "/llvm_cmake_gcc") !36 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", line: 155, file: !11, scope: !10, baseType: !37) !37 = !DIBasicType(tag: DW_TAG_base_type, name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) -!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 43, scope: !34, entity: !39) +!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 43, scope: !34, entity: !39) !39 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", line: 156, file: !11, scope: !10, baseType: !40) !40 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 55, scope: !10, entity: !6) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 55, scope: !10, entity: !6) !42 = !{i32 2, !"Dwarf Version", i32 4} !43 = !{i32 2, !"Debug Info Version", i32 3} !44 = !{!"clang version 3.5.0 (209308)"} diff --git a/test/DllTool/coff-exports.def b/test/DllTool/coff-exports.def new file mode 100644 index 000000000000..0226886a523c --- /dev/null +++ b/test/DllTool/coff-exports.def @@ -0,0 +1,13 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-import-file +; CHECK: Type: code 
+; CHECK: Name type: name +; CHECK: Symbol: __imp_TestFunction +; CHECK: Symbol: TestFunction diff --git a/test/DllTool/coff-weak-exports.def b/test/DllTool/coff-weak-exports.def new file mode 100644 index 000000000000..511d947d8395 --- /dev/null +++ b/test/DllTool/coff-weak-exports.def @@ -0,0 +1,19 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction==AltTestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit diff --git a/test/DllTool/lit.local.cfg b/test/DllTool/lit.local.cfg new file mode 100644 index 000000000000..482608486d21 --- /dev/null +++ b/test/DllTool/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.def'] diff --git a/test/FileCheck/regex-scope.txt b/test/FileCheck/regex-scope.txt index e77f3f6513a8..989f422c6bcd 100644 --- a/test/FileCheck/regex-scope.txt +++ b/test/FileCheck/regex-scope.txt @@ -1,4 +1,4 @@ -// RUN: FileCheck -check-prefix CHECK -input-file %s %s +// RUN: FileCheck -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,LOCAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL --enable-var-scope -input-file %s %s diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 9827e7a6792b..5eb5388c87a8 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -170,6 +170,26 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { ; CHECK: __asan_memcpy ; CHECK: ret void +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* 
nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { + ; This is a canary test to make sure that these don't get lowered into calls that don't + ; have the element-atomic property. Eventually, asan will have to be enhanced to lower + ; these properly. + ; CHECK-LABEL: memintr_element_atomic_test + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ret void +} + + ; CHECK-LABEL: @test_swifterror ; CHECK-NOT: __asan_report_load ; CHECK: ret void diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll new file mode 100644 index 000000000000..32610ce3b815 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll @@ -0,0 +1,48 @@ +; This check verifies that arguments passed by value get redzones. 
+; RUN: opt < %s -asan -asan-realign-stack=32 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { [8 x i32] } + +declare i32 @bar(%struct.A*) + +; Test behavior for named argument with explicit alignment. The memcpy and +; alloca alignments should match the explicit alignment of 64. +define void @foo(%struct.A* byval align 64 %a) sanitize_address { +entry: +; CHECK-LABEL: foo +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 64 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %a +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 64 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %a) + ret void +} + +; Test behavior for unnamed argument without explicit alignment. In this case, +; the first argument is referenced by the identifier %0 and the ABI requires a +; minimum alignment of 4 bytes since struct.A contains i32s which have 4-byte +; alignment. However, the alloca alignment will be 32 since that is the value +; passed via the -asan-realign-stack option, which is greater than 4. 
+define void @baz(%struct.A* byval) sanitize_address { +entry: +; CHECK-LABEL: baz +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 32 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %0 +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 4 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %0) + ret void +} diff --git a/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll new file mode 100644 index 000000000000..9b6e3db9050f --- /dev/null +++ b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that dfsan handles these intrinsics properly once they have been +;; added to that class hierarchy. 
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @test_memcpy(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memcpy + ; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memmove(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memmove + ; CHECK: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memset(i8* nocapture) { + ; CHECK-LABEL: dfs$test_memset + ; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ret void +} diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll index 3457cfc7e278..344ad86e99e4 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll @@ -233,6 +233,39 @@ entry: ; CHECK: ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls. 
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll index 1c5978e52864..22c8d5c59a16 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll @@ -250,6 +250,38 @@ entry: ; CHECK: 
ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls. + +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll index ffb239a15256..47912b5b6901 100644 --- 
a/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -238,6 +238,41 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, ; CHECK: call i8* @__msan_memmove ; CHECK: ret void +;; ------------ +;; Placeholder tests that will fail once element atomic @llvm.mem[cpy|move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that MSAN handles these intrinsics properly once they have been +;; added to that class hierarchy. +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @atomic_memcpy(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memcpy + ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memmove(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memmove + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memset(i8* nocapture %x) nounwind { + ; CHECK-LABEL: atomic_memset + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void 
@llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ret void +} + +;; ------------ + ; Check that we propagate shadow for "select" diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll new file mode 100644 index 000000000000..badf07db2c15 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on a non-x86_64 arch +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = "x86_32-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void @__sanitizer_cov_trace_cmp1(i8, i8) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16, i16) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32, i32) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll new file mode 100644 index 000000000000..16689f9831d8 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on x86_64 +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = 
"x86_64-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void @__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Linker/pr26037.ll b/test/Linker/pr26037.ll index 0e6da17e9fb7..da771669574f 100644 --- a/test/Linker/pr26037.ll +++ b/test/Linker/pr26037.ll @@ -44,13 +44,13 @@ entry: !7 = !{null} !8 = distinct !DISubprogram(name: "b", linkageName: "_ZN1A1bEv", scope: !5, file: !1, line: 8, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !{!10, !16} -!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, line: 8) +!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, file: !1, line: 8) !11 = !{i32 2, !"Dwarf Version", i32 4} !12 = !{i32 2, !"Debug Info Version", i32 3} !13 = !{!"clang version 3.8.0 (trunk 256934) (llvm/trunk 256936)"} !14 = !DILocation(line: 7, column: 12, scope: !4) !15 = !DILocation(line: 8, column: 24, scope: !8) -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, 
file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !5, file: !1, line: 9, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/MC/AArch64/coff-relocations.s b/test/MC/AArch64/coff-relocations.s new file mode 100644 index 000000000000..221ecfd4cd41 --- /dev/null +++ b/test/MC/AArch64/coff-relocations.s @@ -0,0 +1,52 @@ +; RUN: llvm-mc -triple aarch64-windows -filetype obj -o - %s | \ +; RUN: llvm-readobj -r - | FileCheck %s + +; IMAGE_REL_ARM64_ADDR32 +.Linfo_foo: + .asciz "foo" + .long foo + +; IMAGE_REL_ARM64_ADDR32NB +.long func@IMGREL + +; IMAGE_REL_ARM64_ADDR64 +.globl struc +struc: + .quad arr + +; IMAGE_REL_ARM64_BRANCH26 +b target + +; IMAGE_REL_ARM64_PAGEBASE_REL21 +adrp x0, foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12A +add x0, x0, :lo12:foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12L +ldr x0, [x0, :lo12:foo] + +; IMAGE_REL_ARM64_SECREL +.secrel32 .Linfo_bar +.Linfo_bar: + +; IMAGE_REL_ARM64_SECTION +.secidx func + + +; CHECK: Format: COFF-ARM64 +; CHECK: Arch: aarch64 +; CHECK: AddressSize: 64bit +; CHECK: Relocations [ +; CHECK: Section (1) .text { +; CHECK: 0x4 IMAGE_REL_ARM64_ADDR32 foo +; CHECK: 0x8 IMAGE_REL_ARM64_ADDR32NB func +; CHECK: 0xC IMAGE_REL_ARM64_ADDR64 arr +; CHECK: 0x14 IMAGE_REL_ARM64_BRANCH26 target +; CHECK: 0x18 IMAGE_REL_ARM64_PAGEBASE_REL21 foo +; CHECK: 0x1C IMAGE_REL_ARM64_PAGEOFFSET_12A foo +; CHECK: 0x20 IMAGE_REL_ARM64_PAGEOFFSET_12L foo +; CHECK: 0x24 IMAGE_REL_ARM64_SECREL .text +; CHECK: 0x28 IMAGE_REL_ARM64_SECTION func +; CHECK: } +; CHECK: ] diff --git a/test/MC/AArch64/invalid-instructions-spellcheck.s b/test/MC/AArch64/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..8acb285ac9a6 --- /dev/null +++ b/test/MC/AArch64/invalid-instructions-spellcheck.s @@ -0,0 +1,37 @@ +// RUN: not llvm-mc 
-triple=aarch64 -show-encoding < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -mattr=-neon -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-NEON + +// This tests the mnemonic spell checker. + +// First check what happens when an instruction is omitted: + + w1, w2, w3 + +// CHECK: error: unknown token in expression +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ +// CHECK-NEXT: error: invalid operand +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ + +// We don't want to see a suggestion here; the edit distance is too large to +// give sensible suggestions: + + addddddddd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic +// CHECK-NEXT: addddddddd w1, w2, w3 +// CHECK-NEXT: ^ + + addd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic, did you mean: add, addp, adds, addv, fadd, madd? +// CHECK-NEXT: addd w1, w2, w3 +// CHECK-NEXT: ^ + +// Instructions 'addv' and 'addp' are only available when NEON is enabled, so we +// don't want to see them here: + +// CHECK-NO-NEON: error: unrecognized instruction mnemonic, did you mean: add, adds, fadd, madd? 
+// CHECK-NO-NEON-NEXT: addd w1, w2, w3 +// CHECK-NO-NEON-NEXT: ^ diff --git a/test/MC/AMDGPU/gfx9_asm_all.s b/test/MC/AMDGPU/gfx9_asm_all.s index 56484a37bdce..40bc2f8e159a 100644 --- a/test/MC/AMDGPU/gfx9_asm_all.s +++ b/test/MC/AMDGPU/gfx9_asm_all.s @@ -105392,3 +105392,354 @@ v_mad_mixlo_f16 v5, |v1|, |v2|, |v3| v_mad_mixlo_f16 v5, v1, v2, v3 clamp // CHECK: [0x05,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_i16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_lo, v3 +// CHECK: 
[0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_i16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_i16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_i16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, 
v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_u16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, 
vcc_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_u16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_u16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_u16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: 
[0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_sub_u16 v5, v1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v255, v1, v2 +// CHECK: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v255, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] + +v_pk_sub_u16 v5, s1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] + +v_pk_sub_u16 v5, s101, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] + +v_pk_sub_u16 v5, m0, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] + +v_pk_sub_u16 v5, v1, v255 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] + +v_pk_sub_u16 v5, v1, s2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] + +v_pk_sub_u16 v5, v1, s101 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] + +v_pk_sub_u16 v5, v1, m0 +// CHECK: 
[0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] +// CHECK: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] +// CHECK: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] +// CHECK: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] + +v_pk_sub_u16 v5, v1, v2 clamp +// CHECK: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] diff --git a/test/MC/AMDGPU/vop3-errs.s b/test/MC/AMDGPU/vop3-errs.s index 7ba577049af3..855dd0b5de08 100644 --- a/test/MC/AMDGPU/vop3-errs.s +++ b/test/MC/AMDGPU/vop3-errs.s @@ -1,35 +1,47 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN v_add_f32_e64 v0, v1 -// CHECK: error: too few operands for instruction +// GCN: error: too few operands for 
instruction v_div_scale_f32 v24, vcc, v22, 1.1, v22 -// CHECK: error: invalid operand for instruction +// GCN: error: invalid operand for instruction v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] -// CHECK: error: instruction not supported on this GPU +// GFX67: error: instruction not supported on this GPU +// GFX89: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[0:1], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[1:2], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[2:3], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[3:4], v[0:1], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[4:5], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[5:6], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[8:9], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[9:10], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources + +v_cmp_eq_f32_e64 vcc, v0, v1 mul:2 +// GCN: error: invalid operand for instruction + +v_cmp_le_f64_e64 vcc, v0, v1 mul:4 +// GCN: error: invalid operand for instruction + +v_cvt_u32_f32_e64 v0, v1 div:2 +// GCN: error: invalid operand for 
instruction \ No newline at end of file diff --git a/test/MC/ARM/virtexts-thumb.s b/test/MC/ARM/virtexts-thumb.s index d911e1dfb1d7..0ea0bafe5a86 100644 --- a/test/MC/ARM/virtexts-thumb.s +++ b/test/MC/ARM/virtexts-thumb.s @@ -50,7 +50,7 @@ # CHECK-THUMB: [0xde,0xf3,0x00,0x8f] # SUBS PC, LR, #0 should have the same encoding as ERET. -# The conditional forms can't be tested becuse the ARM assembler parser doesn't +# The conditional forms can't be tested because the ARM assembler parser doesn't # accept SUBS PC, LR, #, only the unconditonal form is allowed. This # is due to the way that the custom parser handles optional operands; see the # FIXME in ARM/AsmParser/ARMAsmParser.cpp. diff --git a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt index 63374300504c..188534706eb5 100644 --- a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt +++ b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt @@ -88934,3 +88934,408 @@ # CHECK: v_pk_sub_i16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18] 0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 ; encoding: 
[0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] 
+0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_i16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] 
+0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, flat_scratch_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_lo, v2, v3 ; encoding: 
[0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] 
+0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_u16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] 
+0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_sub_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v255, v1, v2 ; encoding: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v255, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s101, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] 
+0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, m0, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v255 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s101 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, m0 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] ; 
encoding: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10 + +# CHECK: v_pk_sub_u16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[2:3], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[4:5], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[100:101], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], flat_scratch, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], vcc, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], exec, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04] 
+0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -1, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0.5, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -4.0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s101, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], m0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0, v[3:6] ; encoding: 
[0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -1, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0.5, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -4.0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04 diff --git a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt index 62e7092086aa..7025354d6847 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt @@ -10,23 +10,4 @@ 0x08 0x10 0x65 0x7c # CHECK: fork $2, $3, $5 0x09 0x00 0x80 0x7c # CHECK: yield $4 0x09 0x20 0xa0 0x7c # CHECK: yield $4, $5 -0x02 0x20 0x05 0x41 # CHECK: mftr $4, $5, 0, 2, 0 -0x20 0x20 0x05 0x41 # CHECK: mftr $4, $5, 1, 0, 0 -0x21 0x20 0x00 0x41 # CHECK: mftr $4, $zero, 1, 1, 0 -0x21 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 1, 0 -0x22 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 0 -0x32 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 1 -0x23 0x20 0x1a 0x41 # CHECK: mftr $4, $26, 1, 3, 0 -0x23 0x20 0x1f 0x41 # CHECK: mftr $4, $ra, 1, 3, 0 -0x24 0x20 0x0e 0x41 # CHECK: mftr $4, $14, 1, 4, 0 -0x25 0x20 0x0f 0x41 # CHECK: mftr $4, $15, 1, 5, 0 -0x02 0x28 0x84 0x41 # CHECK: mttr $4, $5, 0, 2, 0 -0x20 0x28 0x84 0x41 # CHECK: mttr $4, $5, 1, 0, 0 -0x21 0x00 0x84 0x41 # CHECK: mttr $4, $zero, 1, 1, 0 -0x21 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 1, 0 -0x22 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 0 -0x32 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 1 -0x23 0xd0 0x84 0x41 # CHECK: mttr $4, $26, 1, 3, 0 -0x23 0xf8 0x84 0x41 # CHECK: mttr $4, $ra, 1, 3, 0 -0x24 0x70 0x84 0x41 # CHECK: mttr $4, $14, 1, 4, 0 -0x25 0x78 0x84 0x41 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git 
a/test/MC/Disassembler/Mips/mt/valid-r2.txt b/test/MC/Disassembler/Mips/mt/valid-r2.txt index 4786d8b5591f..17c42c0614a5 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2.txt @@ -10,23 +10,4 @@ 0x7c 0x65 0x10 0x08 # CHECK: fork $2, $3, $5 0x7c 0x80 0x00 0x09 # CHECK: yield $4 0x7c 0xa0 0x20 0x09 # CHECK: yield $4, $5 -0x41 0x05 0x20 0x02 # CHECK: mftr $4, $5, 0, 2, 0 -0x41 0x05 0x20 0x20 # CHECK: mftr $4, $5, 1, 0, 0 -0x41 0x00 0x20 0x21 # CHECK: mftr $4, $zero, 1, 1, 0 -0x41 0x0a 0x20 0x21 # CHECK: mftr $4, $10, 1, 1, 0 -0x41 0x0a 0x20 0x22 # CHECK: mftr $4, $10, 1, 2, 0 -0x41 0x0a 0x20 0x32 # CHECK: mftr $4, $10, 1, 2, 1 -0x41 0x1a 0x20 0x23 # CHECK: mftr $4, $26, 1, 3, 0 -0x41 0x1f 0x20 0x23 # CHECK: mftr $4, $ra, 1, 3, 0 -0x41 0x0e 0x20 0x24 # CHECK: mftr $4, $14, 1, 4, 0 -0x41 0x0f 0x20 0x25 # CHECK: mftr $4, $15, 1, 5, 0 -0x41 0x84 0x28 0x02 # CHECK: mttr $4, $5, 0, 2, 0 -0x41 0x84 0x28 0x20 # CHECK: mttr $4, $5, 1, 0, 0 -0x41 0x84 0x00 0x21 # CHECK: mttr $4, $zero, 1, 1, 0 -0x41 0x84 0x50 0x21 # CHECK: mttr $4, $10, 1, 1, 0 -0x41 0x84 0x50 0x22 # CHECK: mttr $4, $10, 1, 2, 0 -0x41 0x84 0x50 0x32 # CHECK: mttr $4, $10, 1, 2, 1 -0x41 0x84 0xd0 0x23 # CHECK: mttr $4, $26, 1, 3, 0 -0x41 0x84 0xf8 0x23 # CHECK: mttr $4, $ra, 1, 3, 0 -0x41 0x84 0x70 0x24 # CHECK: mttr $4, $14, 1, 4, 0 -0x41 0x84 0x78 0x25 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git a/test/MC/Disassembler/SystemZ/insns-z14.txt b/test/MC/Disassembler/SystemZ/insns-z14.txt new file mode 100644 index 000000000000..c73b50c1c2fb --- /dev/null +++ b/test/MC/Disassembler/SystemZ/insns-z14.txt @@ -0,0 +1,3253 @@ +# Test z14 instructions that don't have PC-relative operands. 
+# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z14 \ +# RUN: | FileCheck %s + +# CHECK: agh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x38 + +# CHECK: agh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x38 + +# CHECK: agh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x38 + +# CHECK: agh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x38 + +# CHECK: agh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x38 + +# CHECK: agh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x38 + +# CHECK: agh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x38 + +# CHECK: agh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x38 + +# CHECK: agh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x38 + +# CHECK: agh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x38 + +# CHECK: bi -524288 +0xe3 0xf0 0x00 0x00 0x80 0x47 + +# CHECK: bi -1 +0xe3 0xf0 0x0f 0xff 0xff 0x47 + +# CHECK: bi 0 +0xe3 0xf0 0x00 0x00 0x00 0x47 + +# CHECK: bi 1 +0xe3 0xf0 0x00 0x01 0x00 0x47 + +# CHECK: bi 524287 +0xe3 0xf0 0x0f 0xff 0x7f 0x47 + +# CHECK: bi 0(%r1) +0xe3 0xf0 0x10 0x00 0x00 0x47 + +# CHECK: bi 0(%r15) +0xe3 0xf0 0xf0 0x00 0x00 0x47 + +# CHECK: bi 524287(%r1,%r15) +0xe3 0xf1 0xff 0xff 0x7f 0x47 + +# CHECK: bi 524287(%r15,%r1) +0xe3 0xff 0x1f 0xff 0x7f 0x47 + +# CHECK: bic 0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x47 + +# CHECK: bic 0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x47 + +# CHECK: bic 0, 0 +0xe3 0x00 0x00 0x00 0x00 0x47 + +# CHECK: bic 0, 1 +0xe3 0x00 0x00 0x01 0x00 0x47 + +# CHECK: bic 0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x47 + +# CHECK: bic 0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x47 + +# CHECK: bic 0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x47 + +# CHECK: bic 0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x47 + +# CHECK: bic 0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x47 + +# CHECK: bio 0(%r15) +0xe3 0x10 0xf0 0x00 0x00 0x47 + +# CHECK: bih 0(%r15) +0xe3 0x20 0xf0 0x00 0x00 0x47 + +# CHECK: binle 0(%r15) +0xe3 0x30 0xf0 0x00 0x00 0x47 + +# CHECK: bil 0(%r15) +0xe3 0x40 0xf0 0x00 0x00 0x47 + +# CHECK: binhe 0(%r15) +0xe3 0x50 0xf0 0x00 0x00 0x47 + +# CHECK: bilh 0(%r15) 
+0xe3 0x60 0xf0 0x00 0x00 0x47 + +# CHECK: bine 0(%r15) +0xe3 0x70 0xf0 0x00 0x00 0x47 + +# CHECK: bie 0(%r15) +0xe3 0x80 0xf0 0x00 0x00 0x47 + +# CHECK: binlh 0(%r15) +0xe3 0x90 0xf0 0x00 0x00 0x47 + +# CHECK: bihe 0(%r15) +0xe3 0xa0 0xf0 0x00 0x00 0x47 + +# CHECK: binl 0(%r15) +0xe3 0xb0 0xf0 0x00 0x00 0x47 + +# CHECK: bile 0(%r15) +0xe3 0xc0 0xf0 0x00 0x00 0x47 + +# CHECK: binh 0(%r15) +0xe3 0xd0 0xf0 0x00 0x00 0x47 + +# CHECK: bino 0(%r15) +0xe3 0xe0 0xf0 0x00 0x00 0x47 + +# CHECK: irbm %r0, %r0 +0xb9 0xac 0x00 0x00 + +# CHECK: irbm %r0, %r15 +0xb9 0xac 0x00 0x0f + +# CHECK: irbm %r15, %r0 +0xb9 0xac 0x00 0xf0 + +# CHECK: irbm %r7, %r8 +0xb9 0xac 0x00 0x78 + +# CHECK: irbm %r15, %r15 +0xb9 0xac 0x00 0xff + +# CHECK: kma %r2, %r2, %r2 +0xb9 0x29 0x20 0x22 + +# CHECK: kma %r2, %r8, %r14 +0xb9 0x29 0x80 0x2e + +# CHECK: kma %r14, %r8, %r2 +0xb9 0x29 0x80 0xe2 + +# CHECK: kma %r6, %r8, %r10 +0xb9 0x29 0x80 0x6a + +# CHECK: lgg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4c + +# CHECK: lgg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4c + +# CHECK: lgg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4c + +# CHECK: lgg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4c + +# CHECK: lgg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4c + +# CHECK: lgg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4c + +# CHECK: lgg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4c + +# CHECK: lgg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x4c + +# CHECK: lgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4d + +# CHECK: lgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4d + +# CHECK: lgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4d + +# CHECK: lgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4d + +# CHECK: lgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4d + 
+# CHECK: lgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4d + +# CHECK: llgfsg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x48 + +# CHECK: llgfsg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x48 + +# CHECK: llgfsg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x48 + +# CHECK: llgfsg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x48 + +# CHECK: mg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x84 + +# CHECK: mg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x84 + +# CHECK: mg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x84 + +# CHECK: mg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x84 + +# CHECK: mg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x84 + +# CHECK: mg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x84 + +# CHECK: mg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x84 + +# CHECK: mg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x84 + +# CHECK: mg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x84 + +# CHECK: mg %r14, 0 +0xe3 0xe0 0x00 0x00 0x00 0x84 + +# CHECK: mgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x3c + +# CHECK: mgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x3c + +# CHECK: mgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x3c + +# CHECK: mgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x3c + +# CHECK: mgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x3c + +# CHECK: mgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: mgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x3c + +# CHECK: mgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x3c + +# CHECK: mgrk %r0, %r0, %r0 +0xb9 0xec 0x00 0x00 + +# CHECK: mgrk %r0, %r0, %r15 +0xb9 0xec 0xf0 0x00 + +# CHECK: mgrk %r0, 
%r15, %r0 +0xb9 0xec 0x00 0x0f + +# CHECK: mgrk %r14, %r0, %r0 +0xb9 0xec 0x00 0xe0 + +# CHECK: mgrk %r6, %r8, %r9 +0xb9 0xec 0x90 0x68 + +# CHECK: msc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x53 + +# CHECK: msc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x53 + +# CHECK: msc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x53 + +# CHECK: msc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x53 + +# CHECK: msc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x53 + +# CHECK: msc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x53 + +# CHECK: msc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x53 + +# CHECK: msc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x53 + +# CHECK: msc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x53 + +# CHECK: msc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x53 + +# CHECK: msgc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x83 + +# CHECK: msgc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x83 + +# CHECK: msgc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x83 + +# CHECK: msgc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x83 + +# CHECK: msgc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x83 + +# CHECK: msgc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x83 + +# CHECK: msgc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x83 + +# CHECK: msgc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x83 + +# CHECK: msrkc %r0, %r0, %r0 +0xb9 0xfd 0x00 0x00 + +# CHECK: msrkc %r0, %r0, %r15 +0xb9 0xfd 0xf0 0x00 + +# CHECK: msrkc %r0, %r15, %r0 +0xb9 0xfd 0x00 0x0f + +# CHECK: msrkc %r15, %r0, %r0 +0xb9 0xfd 0x00 0xf0 + +# CHECK: msrkc %r7, %r8, %r9 +0xb9 0xfd 0x90 0x78 + +# CHECK: msgrkc %r0, %r0, %r0 +0xb9 0xed 0x00 0x00 + +# CHECK: msgrkc %r0, %r0, %r15 +0xb9 0xed 0xf0 0x00 + +# CHECK: msgrkc %r0, %r15, %r0 +0xb9 0xed 0x00 0x0f + +# CHECK: msgrkc %r15, %r0, %r0 +0xb9 0xed 0x00 0xf0 + +# CHECK: msgrkc %r7, %r8, %r9 +0xb9 0xed 0x90 0x78 + +# CHECK: sgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x39 + +# CHECK: sgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x39 + +# 
CHECK: sgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x39 + +# CHECK: sgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x39 + +# CHECK: sgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x39 + +# CHECK: sgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x39 + +# CHECK: sgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x39 + +# CHECK: sgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x39 + +# CHECK: stgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x49 + +# CHECK: stgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x49 + +# CHECK: stgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x49 + +# CHECK: stgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x49 + +# CHECK: stgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x49 + +# CHECK: stgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x49 + +# CHECK: vap %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x71 + +# CHECK: vap %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x71 + +# CHECK: vap %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x71 + +# CHECK: vap %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x71 + +# CHECK: vap %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x71 + +# CHECK: vbperm %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v15 +0xe7 0x00 0xf0 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x85 + +# CHECK: vbperm %v0, %v15, %v0 +0xe7 0x0f 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x85 + +# CHECK: vbperm %v15, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x85 + +# CHECK: vbperm %v18, %v3, 
%v20 +0xe7 0x23 0x40 0x00 0x0a 0x85 + +# CHECK: vcp %v0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x77 + +# CHECK: vcp %v15, %v0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v31, %v0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x77 + +# CHECK: vcp %v0, %v15, 0 +0xe6 0x00 0xf0 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v31, 0 +0xe6 0x00 0xf0 0x00 0x02 0x77 + +# CHECK: vcp %v3, %v18, 4 +0xe6 0x03 0x20 0x40 0x02 0x77 + +# CHECK: vcvb %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x50 + +# CHECK: vcvb %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vcvb %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x50 + +# CHECK: vcvbg %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x52 + +# CHECK: vcvbg %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x52 + +# CHECK: vcvbg %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x52 + +# CHECK: vcvd %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x58 + +# CHECK: vcvd %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v31, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x58 + +# CHECK: vcvd %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x58 + +# CHECK: vcvdg %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5a + +# CHECK: vcvdg %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v31, %r0, 0, 0 
+0xe6 0xf0 0x00 0x00 0x08 0x5a + +# CHECK: vcvdg %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x5a + +# CHECK: vdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7a + +# CHECK: vdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7a + +# CHECK: vdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7a + +# CHECK: vdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7a + +# CHECK: vdp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7a + +# CHECK: vfasb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe3 + +# CHECK: vfasb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe3 + +# CHECK: vfasb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe3 + +# CHECK: vfasb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe3 + +# CHECK: vfasb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe3 + +# CHECK: vfcesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe8 + +# CHECK: vfcesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe8 + +# CHECK: vfcesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe8 + +# CHECK: vfcesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe8 + +# CHECK: vfcesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xe8 + +# CHECK: vfcesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xe8 + +# CHECK: vfcesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xe8 + +# CHECK: vfcesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xe8 + +# CHECK: vfchsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xeb + +# CHECK: vfchsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xeb + +# CHECK: vfchsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xeb + +# CHECK: vfchsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xeb + +# CHECK: vfchsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xeb + +# CHECK: vfchsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xeb + +# CHECK: vfchsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 
0x22 0xeb + +# CHECK: vfchsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xeb + +# CHECK: vfchsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xeb + +# CHECK: vfchsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xeb + +# CHECK: vfchesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xea + +# CHECK: vfchesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xea + +# CHECK: vfchesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xea + +# CHECK: vfchesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xea + +# CHECK: vfchesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xea + +# CHECK: vfchesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xea + +# CHECK: vfchesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xea + +# CHECK: vfchesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xea + +# CHECK: vfchesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xea + +# CHECK: vfchesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xea + +# CHECK: vfdsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe5 + +# CHECK: vfdsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe5 + +# CHECK: vfdsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe5 + +# CHECK: vfdsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe5 + +# CHECK: vfdsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe5 + +# CHECK: vfisb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x04 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x07 0x20 0xc7 + +# CHECK: vfisb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xc7 + +# CHECK: vfisb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xc7 + +# CHECK: vfisb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xa4 0x24 0xc7 + +# CHECK: vfkedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xe8 + +# CHECK: vfkedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xe8 + +# CHECK: vfkedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xe8 + +# CHECK: vfkedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xe8 + +# CHECK: vfkedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xe8 + +# CHECK: vfkedbs %v0, 
%v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xe8 + +# CHECK: vfkedbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xe8 + +# CHECK: vfkedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xe8 + +# CHECK: vfkedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xe8 + +# CHECK: vfkedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xe8 + +# CHECK: vfkesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xe8 + +# CHECK: vfkesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xe8 + +# CHECK: vfkesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xe8 + +# CHECK: vfkesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xe8 + +# CHECK: vfkesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xe8 + +# CHECK: vfkesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xe8 + +# CHECK: vfkesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xe8 + +# CHECK: vfkesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xe8 + +# CHECK: vfkhdb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xeb + +# CHECK: vfkhdb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xeb + +# CHECK: vfkhdb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xeb + +# CHECK: vfkhdb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xeb + +# CHECK: vfkhdb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xeb + +# CHECK: vfkhdbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xeb + +# CHECK: vfkhdbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xeb + +# CHECK: vfkhdbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xeb + +# CHECK: vfkhsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xeb + +# CHECK: vfkhsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xeb + +# CHECK: vfkhsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xeb + +# CHECK: vfkhsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xeb + +# CHECK: vfkhsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xeb + +# CHECK: vfkhsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 
0xeb + +# CHECK: vfkhsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xeb + +# CHECK: vfkhsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xeb + +# CHECK: vfkhsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xeb + +# CHECK: vfkhsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xeb + +# CHECK: vfkhedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xea + +# CHECK: vfkhedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xea + +# CHECK: vfkhedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xea + +# CHECK: vfkhedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xea + +# CHECK: vfkhedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xea + +# CHECK: vfkhedbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xea + +# CHECK: vfkhedbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xea + +# CHECK: vfkhedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xea + +# CHECK: vfkhedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xea + +# CHECK: vfkhedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xea + +# CHECK: vfkhesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xea + +# CHECK: vfkhesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xea + +# CHECK: vfkhesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xea + +# CHECK: vfkhesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xea + +# CHECK: vfkhesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xea + +# CHECK: vfkhesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xea + +# CHECK: vfkhesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xea + +# CHECK: vfkhesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xea + +# CHECK: vfkhesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xea + +# CHECK: vfkhesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xea + +# CHECK: vfpsosb %v0, %v0, 3 +0xe7 0x00 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xcc + +# CHECK: vfpsosb %v0, %v15, 3 +0xe7 0x0f 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v31, 3 +0xe7 0x0f 0x00 0x30 0x24 0xcc + +# CHECK: vfpsosb %v15, %v0, 3 +0xe7 0xf0 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v31, %v0, 3 +0xe7 0xf0 0x00 0x30 0x28 0xcc + +# CHECK: 
vfpsosb %v14, %v17, 7 +0xe7 0xe1 0x00 0x70 0x24 0xcc + +# CHECK: vflcsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcc + +# CHECK: vflcsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xcc + +# CHECK: vflcsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcc + +# CHECK: vflnsb %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v15 +0xe7 0x0f 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v31 +0xe7 0x0f 0x00 0x10 0x24 0xcc + +# CHECK: vflnsb %v15, %v0 +0xe7 0xf0 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v31, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xcc + +# CHECK: vflnsb %v14, %v17 +0xe7 0xe1 0x00 0x10 0x24 0xcc + +# CHECK: vflpsb %v0, %v0 +0xe7 0x00 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v15 +0xe7 0x0f 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v31 +0xe7 0x0f 0x00 0x20 0x24 0xcc + +# CHECK: vflpsb %v15, %v0 +0xe7 0xf0 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v31, %v0 +0xe7 0xf0 0x00 0x20 0x28 0xcc + +# CHECK: vflpsb %v14, %v17 +0xe7 0xe1 0x00 0x20 0x24 0xcc + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xef + +# CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xef + +# CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xef + +# CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xef + +# CHECK: vfmaxdb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xef + +# 
CHECK: vfmaxdb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xef + +# CHECK: vfmaxdb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xef + +# CHECK: vfmaxsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xef + +# CHECK: vfmaxsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xef + +# CHECK: vfmaxsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xef + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xee + +# CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xee + +# CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xee + +# CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xee + +# CHECK: vfmindb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xee + +# CHECK: vfmindb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xee + +# CHECK: vfmindb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xee + +# CHECK: vfminsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xee + +# CHECK: vfminsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xee + +# CHECK: vfminsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xee + +# CHECK: vfmasb %v0, %v0, %v0, %v0 
+0xe7 0x00 0x02 0x00 0x00 0x8f + +# CHECK: vfmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8f + +# CHECK: vfmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8f + +# CHECK: vfmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8f + +# CHECK: vfmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8f + +# CHECK: vfmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8f + +# CHECK: vfmsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe7 + +# CHECK: vfmsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe7 + +# CHECK: vfmsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe7 + +# CHECK: vfmsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe7 + +# CHECK: vfmsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe7 + +# CHECK: vfmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x8e + +# CHECK: vfmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8e + +# CHECK: vfmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8e + +# CHECK: vfmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8e + +# CHECK: vfmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8e + +# CHECK: vfmssb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8e + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9f + +# CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9f + +# CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9f + +# CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9f + +# CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9f + +# CHECK: vfnmadb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9f + +# CHECK: vfnmadb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 
0x00 0x04 0x9f + +# CHECK: vfnmadb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9f + +# CHECK: vfnmadb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9f + +# CHECK: vfnmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9f + +# CHECK: vfnmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9f + +# CHECK: vfnmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9f + +# CHECK: vfnmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9f + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9e + +# CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9e + +# CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9e + +# CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9e + +# CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9e + +# CHECK: vfnmsdb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 0x00 0x04 0x9e + +# CHECK: vfnmsdb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9e + +# CHECK: vfnmsdb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9e + +# CHECK: vfnmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9e + +# CHECK: vfnmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9e + +# CHECK: vfnmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9e + +# CHECK: vfnmssb %v13, 
%v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9e + +# CHECK: vfssb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe2 + +# CHECK: vfssb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe2 + +# CHECK: vfssb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe2 + +# CHECK: vfssb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe2 + +# CHECK: vfssb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe2 + +# CHECK: vfsqsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xce + +# CHECK: vfsqsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xce + +# CHECK: vfsqsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xce + +# CHECK: vftcisb %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf0 0x20 0x4a + +# CHECK: vftcisb %v0, %v15, 0 +0xe7 0x0f 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v31, 0 +0xe7 0x0f 0x00 0x00 0x24 0x4a + +# CHECK: vftcisb %v15, %v0, 0 +0xe7 0xf0 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v31, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0x4a + +# CHECK: vftcisb %v4, %v21, 1656 +0xe7 0x45 0x67 0x80 0x24 0x4a + +# CHECK: vlip %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v0, 0, 15 +0xe6 0x00 0x00 0x00 0xf0 0x49 + +# CHECK: vlip %v0, 65535, 0 +0xe6 0x00 0xff 0xff 0x00 0x49 + +# CHECK: vlip %v15, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v31, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x49 + +# CHECK: vlip %v17, 4660, 7 +0xe6 0x10 0x12 0x34 0x78 0x49 + +# CHECK: vllezlf %v0, 0 +0xe7 0x00 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 4095 +0xe7 0x00 0x0f 0xff 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15) +0xe7 0x00 0xf0 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15,%r1) +0xe7 0x0f 0x10 0x00 0x60 0x04 + +# CHECK: vllezlf %v15, 0 +0xe7 0xf0 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v31, 0 +0xe7 0xf0 0x00 0x00 0x68 0x04 + +# CHECK: vllezlf %v18, 1383(%r3,%r4) +0xe7 0x23 0x45 0x67 0x68 0x04 + +# CHECK: vlrl 
%v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x35 + +# CHECK: vlrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x35 + +# CHECK: vlrl %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x35 + +# CHECK: vlrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x35 + +# CHECK: vlrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x37 + +# CHECK: vlrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x37 + +# CHECK: vlrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x37 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x0f 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 12 +0xe7 0x00 0x00 0xc0 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0xb8 + +# CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xb8 + +# CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xb8 + +# CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 +0xe7 0x23 0x40 0x40 0x5a 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 +0xe7 0x23 0x4b 0x80 0x5a 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 0 +0xe7 0x00 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 12 +0xe7 0x00 0x03 0xc0 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, 
%v15, 0 +0xe7 0x00 0x03 0x00 0xf0 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v31, 0 +0xe7 0x00 0x03 0x00 0xf1 0xb8 + +# CHECK: vmslg %v0, %v0, %v15, %v0, 0 +0xe7 0x00 0xf3 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v31, %v0, 0 +0xe7 0x00 0xf3 0x00 0x02 0xb8 + +# CHECK: vmslg %v0, %v15, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v31, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x04 0xb8 + +# CHECK: vmslg %v15, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v31, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x08 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 4 +0xe7 0x23 0x43 0x40 0x5a 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 8 +0xe7 0x23 0x43 0x80 0x5a 0xb8 + +# CHECK: vmp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x78 + +# CHECK: vmp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x78 + +# CHECK: vmp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x78 + +# CHECK: vmp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x78 + +# CHECK: vmp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x78 + +# CHECK: vmsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x79 + +# CHECK: vmsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x79 + +# CHECK: vmsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x79 + +# CHECK: vmsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x79 + +# CHECK: vmsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x79 + +# CHECK: vnn %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6e + +# CHECK: vnn %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6e + +# CHECK: vnn %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6e + +# CHECK: vnn %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6e + +# CHECK: vnn %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6e + +# CHECK: vnx %v0, %v0, %v0 
+0xe7 0x00 0x00 0x00 0x00 0x6c + +# CHECK: vnx %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6c + +# CHECK: vnx %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6c + +# CHECK: vnx %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6c + +# CHECK: vnx %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6c + +# CHECK: voc %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6f + +# CHECK: voc %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6f + +# CHECK: voc %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6f + +# CHECK: voc %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6f + +# CHECK: voc %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6f + +# CHECK: vpkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x34 + +# CHECK: vpkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x34 + +# CHECK: vpkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x34 + +# CHECK: vpkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x34 + +# CHECK: vpopctb %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vpopctb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x50 + +# CHECK: vpopctb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x04 0x50 + +# CHECK: vpopctf %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0x50 + +# CHECK: vpopctf %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0x50 + +# CHECK: vpopctf %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0x50 + +# CHECK: vpopctg %v0, %v0 +0xe7 0x00 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v15 +0xe7 0x0f 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v31 +0xe7 0x0f 0x00 0x00 0x34 0x50 + +# CHECK: vpopctg %v15, %v0 +0xe7 0xf0 0x00 0x00 0x30 0x50 + +# 
CHECK: vpopctg %v31, %v0 +0xe7 0xf0 0x00 0x00 0x38 0x50 + +# CHECK: vpopctg %v14, %v17 +0xe7 0xe1 0x00 0x00 0x34 0x50 + +# CHECK: vpopcth %v0, %v0 +0xe7 0x00 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v15 +0xe7 0x0f 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v31 +0xe7 0x0f 0x00 0x00 0x14 0x50 + +# CHECK: vpopcth %v15, %v0 +0xe7 0xf0 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v31, %v0 +0xe7 0xf0 0x00 0x00 0x18 0x50 + +# CHECK: vpopcth %v14, %v17 +0xe7 0xe1 0x00 0x00 0x14 0x50 + +# CHECK: vpsop %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5b + +# CHECK: vpsop %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x5b + +# CHECK: vpsop %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x5b + +# CHECK: vpsop %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x5b + +# CHECK: vrp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7b + +# CHECK: vrp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7b + +# CHECK: vrp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7b + +# CHECK: vrp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7b + +# CHECK: vrp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7b + +# CHECK: vsdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7e + +# CHECK: vsdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7e + +# CHECK: vsdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7e + +# CHECK: vsdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7e + +# CHECK: vsdp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7e + +# CHECK: vsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 
0x73 + +# CHECK: vsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x73 + +# CHECK: vsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x73 + +# CHECK: vsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x73 + +# CHECK: vsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x73 + +# CHECK: vsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x73 + +# CHECK: vsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x73 + +# CHECK: vsrp %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x59 + +# CHECK: vsrp %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x59 + +# CHECK: vsrp %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x59 + +# CHECK: vsrp %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x59 + +# CHECK: vstrl %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3d + +# CHECK: vstrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3d + +# CHECK: vstrl %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3d + +# CHECK: vstrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3d + +# CHECK: vstrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3f + +# CHECK: vstrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3f + +# CHECK: vstrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x3f + +# CHECK: vtp %v0 +0xe6 0x00 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v15 +0xe6 0x0f 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v31 +0xe6 0x0f 0x00 0x00 0x04 0x5f + +# CHECK: vupkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3c + 
+# CHECK: vupkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3c + +# CHECK: vupkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: vupkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3c + +# CHECK: vupkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3c + +# CHECK: vupkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3c + +# CHECK: vupkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3c + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe3 + +# CHECK: wfasb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe3 + +# CHECK: wfasb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe3 + +# CHECK: wfasb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe3 + +# CHECK: wfaxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe3 + +# CHECK: wfaxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe3 + +# CHECK: wfaxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe3 + +# CHECK: wfaxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe3 + +# CHECK: wfaxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe3 + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcb + +# CHECK: wfcsb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xcb + +# CHECK: wfcsb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcb + +# CHECK: wfcxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xcb + +# CHECK: wfcxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xcb + +# CHECK: wfcxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xcb + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, 
%v31 +0xe7 0x00 0xf0 0x08 0x22 0xe8 + +# CHECK: wfcesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe8 + +# CHECK: wfcesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe8 + +# CHECK: wfcesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xe8 + +# CHECK: wfcesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xe8 + +# CHECK: wfcesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xe8 + +# CHECK: wfcesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xe8 + +# CHECK: wfcexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe8 + +# CHECK: wfcexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe8 + +# CHECK: wfcexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe8 + +# CHECK: wfcexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe8 + +# CHECK: wfcexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xe8 + +# CHECK: wfcexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xe8 + +# CHECK: wfcexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xe8 + +# CHECK: wfcexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xe8 + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xeb + +# CHECK: wfchsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xeb + +# CHECK: wfchsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xeb + +# CHECK: wfchsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xeb + +# CHECK: wfchsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xeb + +# CHECK: wfchsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xeb 
+ +# CHECK: wfchsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xeb + +# CHECK: wfchxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xeb + +# CHECK: wfchxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xeb + +# CHECK: wfchxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xeb + +# CHECK: wfchxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xeb + +# CHECK: wfchxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xeb + +# CHECK: wfchxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xeb + +# CHECK: wfchxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xeb + +# CHECK: wfchxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xeb + +# CHECK: wfchxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xeb + +# CHECK: wfchxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xeb + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xea + +# CHECK: wfchesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xea + +# CHECK: wfchesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xea + +# CHECK: wfchesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xea + +# CHECK: wfchesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xea + +# CHECK: wfchesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xea + +# CHECK: wfchesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xea + +# CHECK: wfchexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xea + +# CHECK: wfchexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xea + +# CHECK: wfchexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xea + +# CHECK: wfchexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xea + +# CHECK: wfchexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xea + +# CHECK: wfchexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xea + +# CHECK: wfchexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xea + +# CHECK: 
wfchexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xea + +# CHECK: wfchexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xea + +# CHECK: wfchexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xea + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe5 + +# CHECK: wfdsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe5 + +# CHECK: wfdsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe5 + +# CHECK: wfdsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe5 + +# CHECK: wfdxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe5 + +# CHECK: wfdxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe5 + +# CHECK: wfdxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe5 + +# CHECK: wfdxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe5 + +# CHECK: wfdxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe5 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 4, 0 +0xe7 0x00 0x00 0x0c 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 7, 0 +0xe7 0x00 0x00 0x0f 0x20 0xc7 + +# CHECK: wfisb %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xc7 + +# CHECK: wfisb %v31, %f0, 0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xc7 + +# CHECK: wfisb %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x24 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc7 + +# CHECK: wfixb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc7 + +# CHECK: wfixb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc7 + +# CHECK: wfixb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc7 + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb 
%f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xca + +# CHECK: wfksb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xca + +# CHECK: wfksb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xca + +# CHECK: wfksb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xca + +# CHECK: wfkxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xca + +# CHECK: wfkxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xca + +# CHECK: wfkxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xca + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xe8 + +# CHECK: wfkedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xe8 + +# CHECK: wfkedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xe8 + +# CHECK: wfkedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xe8 + +# CHECK: wfkedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xe8 + +# CHECK: wfkedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xe8 + +# CHECK: wfkedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xe8 + +# CHECK: wfkesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xe8 + +# CHECK: wfkesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xe8 + +# CHECK: wfkesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 
0xe8 + +# CHECK: wfkesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xe8 + +# CHECK: wfkesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xe8 + +# CHECK: wfkesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xe8 + +# CHECK: wfkexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xe8 + +# CHECK: wfkexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xe8 + +# CHECK: wfkexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xe8 + +# CHECK: wfkexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xe8 + +# CHECK: wfkexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xe8 + +# CHECK: wfkexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xe8 + +# CHECK: wfkexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xe8 + +# CHECK: wfkexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xe8 + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xeb + +# CHECK: wfkhdb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xeb + +# CHECK: wfkhdb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xeb + +# CHECK: wfkhdb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xeb + +# CHECK: wfkhdbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xeb + +# CHECK: wfkhdbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xeb + +# CHECK: wfkhdbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xeb + +# CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xeb + +# CHECK: wfkhsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xeb + +# CHECK: wfkhsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xeb + +# CHECK: wfkhsb %v18, %f3, 
%v20 +0xe7 0x23 0x40 0x0c 0x2a 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xeb + +# CHECK: wfkhsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xeb + +# CHECK: wfkhsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xeb + +# CHECK: wfkhsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xeb + +# CHECK: wfkhxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xeb + +# CHECK: wfkhxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xeb + +# CHECK: wfkhxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xeb + +# CHECK: wfkhxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xeb + +# CHECK: wfkhxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xeb + +# CHECK: wfkhxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xeb + +# CHECK: wfkhxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xeb + +# CHECK: wfkhxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xeb + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xea + +# CHECK: wfkhedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xea + +# CHECK: wfkhedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xea + +# CHECK: wfkhedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xea + +# CHECK: wfkhedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xea + +# CHECK: wfkhedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xea + +# CHECK: wfkhedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 
0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xea + +# CHECK: wfkhesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xea + +# CHECK: wfkhesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xea + +# CHECK: wfkhesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xea + +# CHECK: wfkhesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xea + +# CHECK: wfkhesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xea + +# CHECK: wfkhesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xea + +# CHECK: wfkhexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xea + +# CHECK: wfkhexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xea + +# CHECK: wfkhexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xea + +# CHECK: wfkhexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xea + +# CHECK: wfkhexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xea + +# CHECK: wfkhexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xea + +# CHECK: wfkhexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xea + +# CHECK: wfkhexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xea + +# CHECK: wfkhexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xea + +# CHECK: wfkhexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xea + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xcc + +# CHECK: wfpsosb %f0, %f15, 3 +0xe7 0x0f 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x24 0xcc + +# CHECK: wfpsosb %f15, %f0, 3 +0xe7 0xf0 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %v31, %f0, 3 +0xe7 0xf0 0x00 0x38 0x28 0xcc + +# CHECK: wfpsosb %f14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x24 0xcc + +# CHECK: wfpsoxb %v0, %v0, 3 +0xe7 0x00 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xcc + +# CHECK: 
wfpsoxb %v0, %v15, 3 +0xe7 0x0f 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x44 0xcc + +# CHECK: wfpsoxb %v15, %v0, 3 +0xe7 0xf0 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v31, %v0, 3 +0xe7 0xf0 0x00 0x38 0x48 0xcc + +# CHECK: wfpsoxb %v14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x44 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xcc + +# CHECK: wflcsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xcc + +# CHECK: wflcsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xcc + +# CHECK: wflcxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xcc + +# CHECK: wflcxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xcc + +# CHECK: wflcxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f15 +0xe7 0x0f 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %v31 +0xe7 0x0f 0x00 0x18 0x24 0xcc + +# CHECK: wflnsb %f15, %f0 +0xe7 0xf0 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %v31, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xcc + +# CHECK: wflnsb %f14, %v17 +0xe7 0xe1 0x00 0x18 0x24 0xcc + +# CHECK: wflnxb %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v15 +0xe7 0x0f 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v31 +0xe7 0x0f 0x00 0x18 0x44 0xcc + +# CHECK: wflnxb %v15, %v0 +0xe7 0xf0 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v31, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xcc + +# CHECK: wflnxb %v14, %v17 +0xe7 0xe1 0x00 0x18 0x44 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: 
wflpsb %f0, %f15 +0xe7 0x0f 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %v31 +0xe7 0x0f 0x00 0x28 0x24 0xcc + +# CHECK: wflpsb %f15, %f0 +0xe7 0xf0 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %v31, %f0 +0xe7 0xf0 0x00 0x28 0x28 0xcc + +# CHECK: wflpsb %f14, %v17 +0xe7 0xe1 0x00 0x28 0x24 0xcc + +# CHECK: wflpxb %v0, %v0 +0xe7 0x00 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v15 +0xe7 0x0f 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v31 +0xe7 0x0f 0x00 0x28 0x44 0xcc + +# CHECK: wflpxb %v15, %v0 +0xe7 0xf0 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v31, %v0 +0xe7 0xf0 0x00 0x28 0x48 0xcc + +# CHECK: wflpxb %v14, %v17 +0xe7 0xe1 0x00 0x28 0x44 0xcc + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %f15 +0xe7 0x0f 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %v31 +0xe7 0x0f 0x00 0x08 0x34 0xc4 + +# CHECK: wflld %v15, %f0 +0xe7 0xf0 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v31, %f0 +0xe7 0xf0 0x00 0x08 0x38 0xc4 + +# CHECK: wflld %v14, %v17 +0xe7 0xe1 0x00 0x08 0x34 0xc4 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc5 + +# CHECK: wflrx %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc5 + +# CHECK: wflrx %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc5 + +# CHECK: wflrx %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc5 + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xef + +# CHECK: wfmaxdb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xef + +# CHECK: wfmaxdb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xef + +# 
CHECK: wfmaxdb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xef + +# CHECK: wfmaxsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xef + +# CHECK: wfmaxsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xef + +# CHECK: wfmaxsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xef + +# CHECK: wfmaxxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xef + +# CHECK: wfmaxxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xef + +# CHECK: wfmaxxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x4a 0xef + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xee + +# CHECK: wfmindb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xee + +# CHECK: wfmindb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xee + +# CHECK: wfmindb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xee + +# CHECK: wfminsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xee + +# CHECK: wfminsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xee + +# CHECK: wfminsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xee + +# CHECK: wfminxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xee + +# 
CHECK: wfminxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xee + +# CHECK: wfminxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xee + +# CHECK: wfminxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xee + +# CHECK: wfminxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xee + +# CHECK: wfminxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x4a 0xee + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8f + +# CHECK: wfmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8f + +# CHECK: wfmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8f + +# CHECK: wfmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8f + +# CHECK: wfmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8f + +# CHECK: wfmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8f + +# CHECK: wfmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8f + +# CHECK: wfmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8f + +# CHECK: wfmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8f + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe7 + +# CHECK: wfmsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe7 + +# CHECK: wfmsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe7 + +# CHECK: wfmsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe7 + +# CHECK: wfmxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe7 + +# CHECK: wfmxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe7 + +# CHECK: wfmxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe7 + +# CHECK: wfmxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe7 + +# CHECK: wfmxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe7 + +# CHECK: wfmssb %f0, %f0, %f0, %f0 
+0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8e + +# CHECK: wfmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8e + +# CHECK: wfmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8e + +# CHECK: wfmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8e + +# CHECK: wfmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8e + +# CHECK: wfmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8e + +# CHECK: wfmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8e + +# CHECK: wfmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8e + +# CHECK: wfmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8e + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9f + +# CHECK: wfnmadb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9f + +# CHECK: wfnmadb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9f + +# CHECK: wfnmadb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9f + +# CHECK: wfnmadb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9f + +# CHECK: wfnmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9f + +# CHECK: wfnmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9f + +# CHECK: wfnmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9f + +# CHECK: wfnmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 
0x08 0xf1 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9f + +# CHECK: wfnmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9f + +# CHECK: wfnmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9f + +# CHECK: wfnmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9f + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9e + +# CHECK: wfnmsdb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9e + +# CHECK: wfnmsdb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9e + +# CHECK: wfnmsdb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9e + +# CHECK: wfnmsdb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9e + +# CHECK: wfnmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9e + +# CHECK: wfnmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9e + +# CHECK: wfnmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9e + +# CHECK: wfnmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9e + +# CHECK: wfnmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9e + +# CHECK: wfnmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9e + +# CHECK: wfnmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9e + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe2 + +# CHECK: wfssb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe2 + +# CHECK: wfssb 
%v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe2 + +# CHECK: wfssb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe2 + +# CHECK: wfsxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe2 + +# CHECK: wfsxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe2 + +# CHECK: wfsxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe2 + +# CHECK: wfsxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe2 + +# CHECK: wfsxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe2 + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xce + +# CHECK: wfsqsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xce + +# CHECK: wfsqsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xce + +# CHECK: wfsqxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xce + +# CHECK: wfsqxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xce + +# CHECK: wfsqxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xce + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 4095 +0xe7 0x00 0xff 0xf8 0x20 0x4a + +# CHECK: wftcisb %f0, %f15, 0 +0xe7 0x0f 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x24 0x4a + +# CHECK: wftcisb %f15, %f0, 0 +0xe7 0xf0 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %v31, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0x4a + +# CHECK: wftcisb %f4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x24 0x4a + +# CHECK: wftcixb %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf8 0x40 0x4a + +# CHECK: wftcixb %v0, %v15, 0 +0xe7 0x0f 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x44 0x4a + +# CHECK: 
wftcixb %v15, %v0, 0 +0xe7 0xf0 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v31, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0x4a + +# CHECK: wftcixb %v4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x44 0x4a + diff --git a/test/MC/Mips/mt/invalid-wrong-error.s b/test/MC/Mips/mt/invalid-wrong-error.s deleted file mode 100644 index 0247089b70ae..000000000000 --- a/test/MC/Mips/mt/invalid-wrong-error.s +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt < %s 2>&1 | FileCheck %s - mftr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list - mttr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/invalid.s b/test/MC/Mips/mt/invalid.s index d4055c4a50f4..5a145a7e0850 100644 --- a/test/MC/Mips/mt/invalid.s +++ b/test/MC/Mips/mt/invalid.s @@ -1,27 +1,13 @@ # RUN: not llvm-mc -arch=mips -mcpu=mips32 -mattr=+mt < %s 2>&1 | FileCheck %s - dmt 4 # CHECK: error: invalid operand for instruction - dmt $4, $5 # CHECK: error: invalid operand for instruction - dmt $5, 0($4) # CHECK: error: invalid operand for instruction - emt 4 # CHECK: error: invalid operand for instruction - emt $4, $5 # CHECK: error: invalid operand for instruction - emt $5, 0($5) # CHECK: error: invalid operand for instruction - dvpe 4 # CHECK: error: invalid operand for instruction - dvpe $4, $5 # CHECK: error: invalid operand for instruction - dvpe $5, 0($4) # CHECK: error: invalid operand for instruction - evpe 4 # CHECK: error: invalid operand for instruction - evpe $4, $5 # CHECK: error: invalid operand for instruction - evpe $5, 0($5) # CHECK: error: invalid operand for instruction - mftr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mftr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mftr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned 
immediate - mftr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mttr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate + dmt 4 # CHECK: error: invalid operand for instruction + dmt $4, $5 # CHECK: error: invalid operand for instruction + dmt $5, 0($4) # CHECK: error: invalid operand for instruction + emt 4 # CHECK: error: invalid operand for instruction + emt $4, $5 # CHECK: error: invalid operand for instruction + emt $5, 0($5) # CHECK: error: invalid operand for instruction + dvpe 4 # CHECK: error: invalid operand for instruction + dvpe $4, $5 # CHECK: error: invalid operand for instruction + dvpe $5, 0($4) # CHECK: error: invalid operand for instruction + evpe 4 # CHECK: error: invalid operand for instruction + evpe $4, $5 # CHECK: error: invalid operand for instruction + evpe $5, 0($5) # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s deleted file mode 100644 index 4e872412e6ef..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - -# The integrated assembler produces a wrong or misleading error message. 
- - mftc0 0($4), $5 # CHECK: error: unexpected token in argument list - mftc0 0($4), $5, 1 # CHECK: error: unexpected token in argument list - mftgpr 0($4), $5 # CHECK: error: unexpected token in argument list - mftlo 0($3) # CHECK: error: unexpected token in argument list - mftlo 0($3), $ac1 # CHECK: error: unexpected token in argument list - mfthi 0($3) # CHECK: error: unexpected token in argument list - mfthi 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftacx 0($3) # CHECK: error: unexpected token in argument list - mftacx 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftdsp 0($4) # CHECK: error: unexpected token in argument list - mftc1 0($4), $f4 # CHECK: error: unexpected token in argument list - mfthc1 0($4), $f4 # CHECK: error: unexpected token in argument list - cftc1 0($4), $f8 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s deleted file mode 100644 index 06ae8c72e654..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s +++ /dev/null @@ -1,23 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - - mftc0 $4, 0($5) # CHECK: error: invalid operand for instruction - mftc0 $4, 0($5), 1 # CHECK: error: invalid operand for instruction - mftc0 $4, $5, -1 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, 9 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, $6 # CHECK: error: expected 3-bit unsigned immediate - mftgpr $4, 0($5) # CHECK: error: invalid operand for instruction - mftgpr $4, $5, $6 # CHECK: error: invalid operand for instruction - mftlo $3, 0($ac1) # CHECK: error: invalid operand for instruction - mftlo $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mfthi $3, 0($ac1) # CHECK: error: invalid operand for instruction - mfthi $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftacx 
$3, 0($ac1) # CHECK: error: invalid operand for instruction - mftacx $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftdsp $4, $5 # CHECK: error: invalid operand for instruction - mftdsp $4, $f5 # CHECK: error: invalid operand for instruction - mftdsp $4, $ac0 # CHECK: error: invalid operand for instruction - mftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - mfthc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, $f4, $5 # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases.s b/test/MC/Mips/mt/mftr-mttr-aliases.s deleted file mode 100644 index 92ed9f9281f2..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases.s +++ /dev/null @@ -1,47 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# Check the various aliases of the m[ft]tr instruction. - - mftc0 $4, $5 # CHECK: mftr $4, $5, 0, 0, 0 # encoding: [0x41,0x05,0x20,0x00] - mftc0 $6, $7, 1 # CHECK: mftr $6, $7, 0, 1, 0 # encoding: [0x41,0x07,0x30,0x01] - mftgpr $5, $9 # CHECK: mftr $5, $9, 1, 0, 0 # encoding: [0x41,0x09,0x28,0x20] - mftlo $3 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac0 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac1 # CHECK: mftr $3, $4, 1, 1, 0 # encoding: [0x41,0x04,0x18,0x21] - mftlo $3, $ac2 # CHECK: mftr $3, $8, 1, 1, 0 # encoding: [0x41,0x08,0x18,0x21] - mftlo $3, $ac3 # CHECK: mftr $3, $12, 1, 1, 0 # encoding: [0x41,0x0c,0x18,0x21] - mfthi $3, $ac0 # CHECK: mftr $3, $1, 1, 1, 0 # encoding: [0x41,0x01,0x18,0x21] - mfthi $3, $ac1 # CHECK: mftr $3, $5, 1, 1, 0 # encoding: [0x41,0x05,0x18,0x21] - mfthi $3, $ac2 # CHECK: mftr $3, $9, 1, 1, 0 # encoding: [0x41,0x09,0x18,0x21] - mfthi $3, $ac3 # CHECK: mftr $3, $13, 1, 1, 0 # encoding: [0x41,0x0d,0x18,0x21] - mftacx $3, $ac0 # CHECK: mftr $3, $2, 1, 1, 0 # encoding: 
[0x41,0x02,0x18,0x21] - mftacx $3, $ac1 # CHECK: mftr $3, $6, 1, 1, 0 # encoding: [0x41,0x06,0x18,0x21] - mftacx $3, $ac2 # CHECK: mftr $3, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x18,0x21] - mftacx $3, $ac3 # CHECK: mftr $3, $14, 1, 1, 0 # encoding: [0x41,0x0e,0x18,0x21] - mftdsp $4 # CHECK: mftr $4, $16, 1, 1, 0 # encoding: [0x41,0x10,0x20,0x21] - mftc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 0 # encoding: [0x41,0x05,0x20,0x22] - mfthc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 1 # encoding: [0x41,0x05,0x20,0x32] - cftc1 $4, $f9 # CHECK: mftr $4, $9, 1, 3, 0 # encoding: [0x41,0x09,0x20,0x23] - - mttc0 $4, $5 # CHECK: mttr $4, $5, 0, 0, 0 # encoding: [0x41,0x84,0x28,0x00] - mttc0 $6, $7, 1 # CHECK: mttr $6, $7, 0, 1, 0 # encoding: [0x41,0x86,0x38,0x01] - mttgpr $5, $9 # CHECK: mttr $5, $9, 1, 0, 0 # encoding: [0x41,0x85,0x48,0x20] - mttlo $3 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac0 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac1 # CHECK: mttr $3, $4, 1, 1, 0 # encoding: [0x41,0x83,0x20,0x21] - mttlo $3, $ac2 # CHECK: mttr $3, $8, 1, 1, 0 # encoding: [0x41,0x83,0x40,0x21] - mttlo $3, $ac3 # CHECK: mttr $3, $12, 1, 1, 0 # encoding: [0x41,0x83,0x60,0x21] - mtthi $3 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac0 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac1 # CHECK: mttr $3, $5, 1, 1, 0 # encoding: [0x41,0x83,0x28,0x21] - mtthi $3, $ac2 # CHECK: mttr $3, $9, 1, 1, 0 # encoding: [0x41,0x83,0x48,0x21] - mtthi $3, $ac3 # CHECK: mttr $3, $13, 1, 1, 0 # encoding: [0x41,0x83,0x68,0x21] - mttacx $3 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac0 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac1 # CHECK: mttr $3, $6, 1, 1, 0 # encoding: [0x41,0x83,0x30,0x21] - mttacx $3, $ac2 # CHECK: mttr $3, $10, 1, 1, 0 # encoding: [0x41,0x83,0x50,0x21] - mttacx $3, $ac3 # CHECK: mttr $3, 
$14, 1, 1, 0 # encoding: [0x41,0x83,0x70,0x21] - mttdsp $4 # CHECK: mttr $4, $16, 1, 1, 0 # encoding: [0x41,0x84,0x80,0x21] - mttc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 0 # encoding: [0x41,0x84,0x28,0x22] - mtthc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 1 # encoding: [0x41,0x84,0x28,0x32] - cttc1 $4, $f9 # CHECK: mttr $4, $9, 1, 3, 0 # encoding: [0x41,0x84,0x48,0x23] diff --git a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s deleted file mode 100644 index c40e81bfc7d7..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# The selector value and register values here are marked as reserved in the -# documentation, but GAS accepts them without warning. - mftr $31, $31, 1, 1, 0 # CHECK: mftr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x1f,0xf8,0x21] - mttr $31, $31, 1, 1, 0 # CHECK: mttr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x9f,0xf8,0x21] - mftr $31, $13, 1, 6, 0 # CHECK: mftr $ra, $13, 1, 6, 0 # encoding: [0x41,0x0d,0xf8,0x26] - mttr $31, $13, 1, 6, 0 # CHECK: mttr $ra, $13, 1, 6, 0 # encoding: [0x41,0x9f,0x68,0x26] diff --git a/test/MC/Mips/mt/valid.s b/test/MC/Mips/mt/valid.s index 9fa07870a61f..ab1179d05c6a 100644 --- a/test/MC/Mips/mt/valid.s +++ b/test/MC/Mips/mt/valid.s @@ -1,33 +1,13 @@ # RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ # RUN: | FileCheck %s - dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] - dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1] - emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] - emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] - dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] - dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] - evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] - evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] - fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] 
- yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] - yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] - mftr $4, $5, 0, 2, 0 # CHECK: mftr $4, $5, 0, 2, 0 # encoding: [0x41,0x05,0x20,0x02] - mftr $4, $5, 1, 0, 0 # CHECK: mftr $4, $5, 1, 0, 0 # encoding: [0x41,0x05,0x20,0x20] - mftr $4, $0, 1, 1, 0 # CHECK: mftr $4, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x20,0x21] - mftr $4, $10, 1, 1, 0 # CHECK: mftr $4, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x20,0x21] - mftr $4, $10, 1, 2, 0 # CHECK: mftr $4, $10, 1, 2, 0 # encoding: [0x41,0x0a,0x20,0x22] - mftr $4, $10, 1, 2, 1 # CHECK: mftr $4, $10, 1, 2, 1 # encoding: [0x41,0x0a,0x20,0x32] - mftr $4, $26, 1, 3, 0 # CHECK: mftr $4, $26, 1, 3, 0 # encoding: [0x41,0x1a,0x20,0x23] - mftr $4, $31, 1, 3, 0 # CHECK: mftr $4, $ra, 1, 3, 0 # encoding: [0x41,0x1f,0x20,0x23] - mftr $4, $14, 1, 4, 0 # CHECK: mftr $4, $14, 1, 4, 0 # encoding: [0x41,0x0e,0x20,0x24] - mftr $4, $15, 1, 5, 0 # CHECK: mftr $4, $15, 1, 5, 0 # encoding: [0x41,0x0f,0x20,0x25] - mttr $4, $5, 0, 2, 0 # CHECK: mttr $4, $5, 0, 2, 0 # encoding: [0x41,0x84,0x28,0x02] - mttr $4, $5, 1, 0, 0 # CHECK: mttr $4, $5, 1, 0, 0 # encoding: [0x41,0x84,0x28,0x20] - mttr $4, $0, 1, 1, 0 # CHECK: mttr $4, $zero, 1, 1, 0 # encoding: [0x41,0x84,0x00,0x21] - mttr $4, $10, 1, 1, 0 # CHECK: mttr $4, $10, 1, 1, 0 # encoding: [0x41,0x84,0x50,0x21] - mttr $4, $10, 1, 2, 0 # CHECK: mttr $4, $10, 1, 2, 0 # encoding: [0x41,0x84,0x50,0x22] - mttr $4, $10, 1, 2, 1 # CHECK: mttr $4, $10, 1, 2, 1 # encoding: [0x41,0x84,0x50,0x32] - mttr $4, $26, 1, 3, 0 # CHECK: mttr $4, $26, 1, 3, 0 # encoding: [0x41,0x84,0xd0,0x23] - mttr $4, $31, 1, 3, 0 # CHECK: mttr $4, $ra, 1, 3, 0 # encoding: [0x41,0x84,0xf8,0x23] - mttr $4, $14, 1, 4, 0 # CHECK: mttr $4, $14, 1, 4, 0 # encoding: [0x41,0x84,0x70,0x24] - mttr $4, $15, 1, 5, 0 # CHECK: mttr $4, $15, 1, 5, 0 # encoding: [0x41,0x84,0x78,0x25] + dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] + dmt $5 # CHECK: dmt $5 # encoding: 
[0x41,0x65,0x0b,0xc1] + emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] + emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] + dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] + dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] + evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] + evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] + fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] + yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] + yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] diff --git a/test/MC/SystemZ/insn-bad-z13.s b/test/MC/SystemZ/insn-bad-z13.s index e9fac44aa883..0ccdd11cbe97 100644 --- a/test/MC/SystemZ/insn-bad-z13.s +++ b/test/MC/SystemZ/insn-bad-z13.s @@ -4,6 +4,19 @@ # RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch11 < %s 2> %t # RUN: FileCheck < %t %s +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: agh %r0, 0 + + agh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bi 0 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bic 0, 0 + + bi 0 + bic 0, 0 + #CHECK: error: invalid operand #CHECK: cdpt %f0, 0(1), -1 #CHECK: error: invalid operand @@ -150,6 +163,16 @@ cxpt %f0, 0(-), 0 cxpt %f15, 0(1), 0 +#CHECK: error: instruction requires: insert-reference-bits-multiple +#CHECK: irbm %r0, %r0 + + irbm %r0, %r0 + +#CHECK: error: instruction requires: message-security-assist-extension8 +#CHECK: kma %r2, %r4, %r6 + + kma %r2, %r4, %r6 + #CHECK: error: invalid operand #CHECK: lcbb %r0, 0, -1 #CHECK: error: invalid operand @@ -167,6 +190,21 @@ lcbb %r0, 4096, 0 lcbb %r0, 0(%v1,%r2), 0 +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgg %r0, 0 + + lgg %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgsc %r0, 0 + + lgsc %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: llgfsg %r0, 0 + + llgfsg %r0, 0 + #CHECK: error: invalid 
operand #CHECK: llzrgf %r0, -524289 #CHECK: error: invalid operand @@ -249,6 +287,41 @@ lzrg %r0, -524289 lzrg %r0, 524288 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mg %r0, 0 + + mg %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgh %r0, 0 + + mgh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgrk %r0, %r0, %r0 + + mgrk %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msc %r0, 0 + + msc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgc %r0, 0 + + msgc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msrkc %r0, %r0, %r0 + + msrkc %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgrkc %r0, %r0, %r0 + + msgrkc %r0, %r0, %r0 + #CHECK: error: invalid register pair #CHECK: ppno %r1, %r2 #CHECK: error: invalid register pair @@ -257,6 +330,21 @@ ppno %r1, %r2 ppno %r2, %r1 +#CHECK: error: instruction requires: message-security-assist-extension7 +#CHECK: prno %r2, %r4 + + prno %r2, %r4 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: sgh %r0, 0 + + sgh %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: stgsc %r0, 0 + + stgsc %r0, 0 + #CHECK: error: invalid operand #CHECK: stocfh %r0, 0, -1 #CHECK: error: invalid operand @@ -274,6 +362,16 @@ stocfh %r0, 524288, 1 stocfh %r0, 0(%r1,%r2), 1 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vap %v0, %v0, %v0, 0, 0 + + vap %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vbperm %v0, %v0, %v0 + + vbperm %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vcdg %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -410,6 +508,35 @@ vclgdb %v0, %v0, -1, 0 vclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcp %v0, %v0, 0 + + vcp 
%v0, %v0, 0 + +#CHECK: vcvb %r0, %v0, 0 + + vcvb %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvbg %r0, %v0, 0 + + vcvbg %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvd %v0, %r0, 0, 0 + + vcvd %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvdg %v0, %r0, 0, 0 + + vcvdg %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vdp %v0, %v0, %v0, 0, 0 + + vdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: verim %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -828,6 +955,40 @@ vfaezhs %v0, %v0 vfaezhs %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfasb %v0, %v0, %v0 + + vfasb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesbs %v0, %v0, %v0 + + vfcesb %v0, %v0, %v0 + vfcesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsbs %v0, %v0, %v0 + + vfchsb %v0, %v0, %v0 + vfchsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesbs %v0, %v0, %v0 + + vfchesb %v0, %v0, %v0 + vfchesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfdsb %v0, %v0, %v0 + + vfdsb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vfee %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1130,6 +1291,152 @@ vfidb %v0, %v0, -1, 0 vfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfisb %v0, %v0, 0, 0 + + vfisb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedb %v0, %v0, %v0 
+#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesbs %v0, %v0, %v0 + + vfkedb %v0, %v0, %v0 + vfkedbs %v0, %v0, %v0 + vfkesb %v0, %v0, %v0 + vfkesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsbs %v0, %v0, %v0 + + vfkhdb %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v0 + vfkhsb %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesbs %v0, %v0, %v0 + + vfkhedb %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v0 + vfkhesb %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflpsb %v0, %v0 + + vfpsosb %v0, %v0, 0 + vflcsb %v0, %v0 + vflnsb %v0, %v0 + vflpsb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfll %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflls %v0, %v0 + + vfll %v0, %v0, 0, 0 + vflls %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 
+#CHECK: vflr %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflrd %v0, %v0, 0, 0 + + vflr %v0, %v0, 0, 0, 0 + vflrd %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxsb %v0, %v0, %v0, 0 + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmaxdb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfminsb %v0, %v0, %v0, 0 + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmindb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmasb %v0, %v0, %v0, %v0 + + vfmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmsb %v0, %v0, %v0 + + vfmsb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmssb %v0, %v0, %v0, %v0 + + vfmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmasb %v0, %v0, %v0, %v0 + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnmadb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmssb %v0, %v0, %v0, %v0 + + vfnms %v0, %v0, %v0, 
%v0, 0, 0 + vfnmsdb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfssb %v0, %v0, %v0 + + vfssb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfsqsb %v0, %v0 + + vfsqsb %v0, %v0 + #CHECK: error: invalid operand #CHECK: vftci %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -1158,6 +1465,11 @@ vftcidb %v0, %v0, -1 vftcidb %v0, %v0, 4096 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vftcisb %v0, %v0, 0 + + vftcisb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: vgbm %v0, -1 #CHECK: error: invalid operand @@ -1615,6 +1927,11 @@ vlgvh %r0, %v0, 4096 vlgvh %r0, %v0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlip %v0, 0, 0 + + vlip %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vll %v0, %r0, -1 #CHECK: error: invalid operand @@ -1687,6 +2004,11 @@ vllezh %v0, 4096 vllezh %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vllezlf %v0, 0 + + vllezlf %v0, 0 + #CHECK: error: invalid operand #CHECK: vlm %v0, %v0, -1 #CHECK: error: invalid operand @@ -1756,6 +2078,16 @@ vlreph %v0, 4096 vlreph %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrl %v0, 0, 0 + + vlrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrlr %v0, %r0, 0 + + vlrlr %v0, %r0, 0 + #CHECK: error: invalid operand #CHECK: vlvg %v0, %r0, 0, -1 #CHECK: error: invalid operand @@ -1817,6 +2149,39 @@ vlvgh %v0, %r0, 4096 vlvgh %v0, %r0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vmp %v0, %v0, %v0, 0, 0 + + vmp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmslg %v0, %v0, %v0, %v0, 0 + +#CHECK: 
error: instruction requires: vector-packed-decimal +#CHECK: vmsp %v0, %v0, %v0, 0, 0 + + vmsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnn %v0, %v0, %v0 + + vnn %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnx %v0, %v0, %v0 + + vnx %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: voc %v0, %v0, %v0 + + voc %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vpdi %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1825,6 +2190,30 @@ vpdi %v0, %v0, %v0, -1 vpdi %v0, %v0, %v0, 16 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpkz %v0, 0, 0 + + vpkz %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctf %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctg %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopcth %v0, %v0 + + vpopctb %v0, %v0 + vpopctf %v0, %v0 + vpopctg %v0, %v0 + vpopcth %v0, %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpsop %v0, %v0, 0, 0, 0 + + vpsop %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vrep %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1917,6 +2306,11 @@ vrepih %v0, -32769 vrepih %v0, 32768 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vrp %v0, %v0, %v0, 0, 0 + + vrp %v0, %v0, %v0, 0, 0 + #CHECK: error: vector index required #CHECK: vscef %v0, 0(%r1), 0 #CHECK: error: vector index required @@ -1957,6 +2351,11 @@ vsceg %v0, -1(%v0,%r1), 0 vsceg %v0, 4096(%v0,%r1), 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsdp %v0, %v0, %v0, 0, 0 + + vsdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vsldb %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1965,6 +2364,16 @@ vsldb %v0, %v0, %v0, -1 vsldb %v0, %v0, 
%v0, 256 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsp %v0, %v0, %v0, 0, 0 + + vsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsrp %v0, %v0, 0, 0, 0 + + vsrp %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vst %v0, -1 #CHECK: error: invalid operand @@ -2251,6 +2660,26 @@ vstrczhs %v0, %v0, %v0 vstrczhs %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrl %v0, 0, 0 + + vstrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrlr %v0, %r0, 0 + + vstrlr %v0, %r0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vtp %v0 + + vtp %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vupkz %v0, 0, 0 + + vupkz %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: wcdgb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2307,6 +2736,72 @@ wclgdb %v0, %v0, -1, 0 wclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfasb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfaxb %v0, %v0, %v0 + + wfasb %v0, %v0, %v0 + wfaxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcxb %v0, %v0 + + wfcsb %v0, %v0 + wfcxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexbs %v0, %v0, %v0 + + wfcesb %v0, %v0, %v0 + wfcesbs %v0, %v0, %v0 + wfcexb %v0, %v0, %v0 + wfcexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchsb %v0, %v0, %v0 +#CHECK: error: 
instruction requires: vector-enhancements-1 +#CHECK: wfchsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxbs %v0, %v0, %v0 + + wfchsb %v0, %v0, %v0 + wfchsbs %v0, %v0, %v0 + wfchxb %v0, %v0, %v0 + wfchxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexbs %v0, %v0, %v0 + + wfchesb %v0, %v0, %v0 + wfchesbs %v0, %v0, %v0 + wfchexb %v0, %v0, %v0 + wfchexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdxb %v0, %v0, %v0 + + wfdsb %v0, %v0, %v0 + wfdxb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: wfidb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2321,6 +2816,208 @@ wfidb %v0, %v0, -1, 0 wfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfisb %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfixb %v0, %v0, 0, 0 + + wfisb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfksb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkxb %v0, %v0 + + wfksb %v0, %v0 + wfkxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesbs %v0, 
%v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexbs %v0, %v0, %v0 + + wfkedb %v0, %v0, %v0 + wfkedbs %v0, %v0, %v0 + wfkesb %v0, %v0, %v0 + wfkesbs %v0, %v0, %v0 + wfkexb %v0, %v0, %v0 + wfkexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxbs %v0, %v0, %v0 + + wfkhdb %v0, %v0, %v0 + wfkhdbs %v0, %v0, %v0 + wfkhsb %v0, %v0, %v0 + wfkhsbs %v0, %v0, %v0 + wfkhxb %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexbs %v0, %v0, %v0 + + wfkhedb %v0, %v0, %v0 + wfkhedbs %v0, %v0, %v0 + wfkhesb %v0, %v0, %v0 + wfkhesbs %v0, %v0, %v0 + wfkhexb %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsoxb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflcsb %v0, %v0 +#CHECK: 
error: instruction requires: vector-enhancements-1 +#CHECK: wflcxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpxb %v0, %v0 + + wfpsosb %v0, %v0, 0 + wfpsoxb %v0, %v0, 0 + wflcsb %v0, %v0 + wflcxb %v0, %v0 + wflnsb %v0, %v0 + wflnxb %v0, %v0 + wflpsb %v0, %v0 + wflpxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflls %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflld %v0, %v0 + + wflls %v0, %v0 + wflld %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrd %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrx %v0, %v0, 0, 0 + + wflrd %v0, %v0, 0, 0 + wflrx %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxxb %v0, %v0, %v0, 0 + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxsb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminxb %v0, %v0, %v0, 0 + + wfmindb %v0, %v0, %v0, 0 + wfminsb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxb %v0, %v0, %v0, %v0 + + wfmasb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v0 + 
+#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmxb %v0, %v0, %v0 + + wfmsb %v0, %v0, %v0 + wfmxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsxb %v0, %v0, %v0, %v0 + + wfmssb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 + + wfnmadb %v0, %v0, %v0, %v0 + wfnmasb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmssb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfssb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsxb %v0, %v0, %v0 + + wfssb %v0, %v0, %v0 + wfsxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqxb %v0, %v0 + + wfsqsb %v0, %v0 + wfsqxb %v0, %v0 + #CHECK: error: invalid operand #CHECK: wftcidb %v0, %v0, -1 #CHECK: error: invalid operand @@ -2329,6 +3026,14 @@ wftcidb %v0, %v0, -1 wftcidb %v0, %v0, 4096 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcisb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcixb %v0, 
%v0, 0 + + wftcisb %v0, %v0, 0 + wftcixb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: wledb %v0, %v0, 0, -1 #CHECK: error: invalid operand diff --git a/test/MC/SystemZ/insn-bad-z14.s b/test/MC/SystemZ/insn-bad-z14.s new file mode 100644 index 000000000000..8bc736a7a1a4 --- /dev/null +++ b/test/MC/SystemZ/insn-bad-z14.s @@ -0,0 +1,752 @@ +# For z14 only. +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=z14 < %s 2> %t +# RUN: FileCheck < %t %s +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch12 < %s 2> %t +# RUN: FileCheck < %t %s + +#CHECK: error: invalid operand +#CHECK: bi -524289 +#CHECK: error: invalid operand +#CHECK: bi 524288 + + bi -524289 + bi 524288 + +#CHECK: error: invalid operand +#CHECK: bic -1, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 16, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 0, -524289 +#CHECK: error: invalid operand +#CHECK: bic 0, 524288 + + bic -1, 0(%r1) + bic 16, 0(%r1) + bic 0, -524289 + bic 0, 524288 + +#CHECK: error: invalid operand +#CHECK: agh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: agh %r0, 524288 + + agh %r0, -524289 + agh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: kma %r1, %r2, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r1, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r4, %r1 + + kma %r1, %r2, %r4 + kma %r2, %r1, %r4 + kma %r2, %r4, %r1 + +#CHECK: error: invalid operand +#CHECK: lgg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgg %r0, 524288 + + lgg %r0, -524289 + lgg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: lgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgsc %r0, 524288 + + lgsc %r0, -524289 + lgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, 524288 + + llgfsg %r0, -524289 + llgfsg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: mg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mg %r0, 524288 +#CHECK: 
error: invalid register pair +#CHECK: mg %r1, 0 + + mg %r0, -524289 + mg %r0, 524288 + mg %r1, 0 + +#CHECK: error: invalid operand +#CHECK: mgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mgh %r0, 524288 + + mgh %r0, -524289 + mgh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: mgrk %r1, %r0, %r0 + + mgrk %r1, %r0, %r0 + +#CHECK: error: invalid operand +#CHECK: msc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msc %r0, 524288 + + msc %r0, -524289 + msc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: msgc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msgc %r0, 524288 + + msgc %r0, -524289 + msgc %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: prno %r1, %r2 +#CHECK: error: invalid register pair +#CHECK: prno %r2, %r1 + + prno %r1, %r2 + prno %r2, %r1 + +#CHECK: error: invalid operand +#CHECK: sgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: sgh %r0, 524288 + + sgh %r0, -524289 + sgh %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: stgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: stgsc %r0, 524288 + + stgsc %r0, -524289 + stgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 256, 0 + + vap %v0, %v0, %v0, 0, -1 + vap %v0, %v0, %v0, 0, 16 + vap %v0, %v0, %v0, -1, 0 + vap %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, 16 + + vcp %v0, %v0, -1 + vcp %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, 16 + + vcvb %r0, %v0, -1 + vcvb %r0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvbg %r0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcvbg %r0, %v0, 16 + + vcvbg %r0, %v0, -1 + vcvbg %r0, %v0, 
16 + +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 256, 0 + + vcvd %r0, %v0, 0, -1 + vcvd %r0, %v0, 0, 16 + vcvd %r0, %v0, -1, 0 + vcvd %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 256, 0 + + vcvdg %r0, %v0, 0, -1 + vcvdg %r0, %v0, 0, 16 + vcvdg %r0, %v0, -1, 0 + vcvdg %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 256, 0 + + vdp %v0, %v0, %v0, 0, -1 + vdp %v0, %v0, %v0, 0, 16 + vdp %v0, %v0, %v0, -1, 0 + vdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 16, 0 + + vfisb %v0, %v0, 0, -1 + vfisb %v0, %v0, 0, 16 + vfisb %v0, %v0, -1, 0 + vfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 16, 0 + + vfll %v0, %v0, 0, -1 + vfll %v0, %v0, 0, 16 + vfll %v0, %v0, -1, 0 + vfll %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, -1, 0 +#CHECK: error: invalid 
operand +#CHECK: vflr %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 16, 0, 0 + + vflr %v0, %v0, 0, 0, -1 + vflr %v0, %v0, 0, 0, 16 + vflr %v0, %v0, 0, -1, 0 + vflr %v0, %v0, 0, 16, 0 + vflr %v0, %v0, -1, 0, 0 + vflr %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 16, 0 + + vflrd %v0, %v0, 0, -1 + vflrd %v0, %v0, 0, 16 + vflrd %v0, %v0, -1, 0 + vflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 16, 0, 0 + + vfmax %v0, %v0, %v0, 0, 0, -1 + vfmax %v0, %v0, %v0, 0, 0, 16 + vfmax %v0, %v0, %v0, 0, -1, 0 + vfmax %v0, %v0, %v0, 0, 16, 0 + vfmax %v0, %v0, %v0, -1, 0, 0 + vfmax %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, 16 + + vfmaxdb %v0, %v0, %v0, -1 + vfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, 16 + + vfmaxsb %v0, %v0, %v0, -1 + vfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: 
invalid operand +#CHECK: vfmin %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 16, 0, 0 + + vfmin %v0, %v0, %v0, 0, 0, -1 + vfmin %v0, %v0, %v0, 0, 0, 16 + vfmin %v0, %v0, %v0, 0, -1, 0 + vfmin %v0, %v0, %v0, 0, 16, 0 + vfmin %v0, %v0, %v0, -1, 0, 0 + vfmin %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, 16 + + vfmindb %v0, %v0, %v0, -1 + vfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, 16 + + vfminsb %v0, %v0, %v0, -1 + vfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 16, 0 + + vfnma %v0, %v0, %v0, %v0, 0, -1 + vfnma %v0, %v0, %v0, %v0, 0, 16 + vfnma %v0, %v0, %v0, %v0, -1, 0 + vfnma %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 16, 0 + + vfnms %v0, %v0, %v0, %v0, 0, -1 + vfnms %v0, %v0, %v0, %v0, 0, 16 + vfnms %v0, %v0, %v0, %v0, -1, 0 + vfnms %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, 4096 + + vftcisb %v0, %v0, -1 + vftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vlip %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 65536, 0 + + vlip %v0, 0, 
-1 + vlip %v0, 0, 16 + vlip %v0, -1, 0 + vlip %v0, 65536, 0 + +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, -1 +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, 4096 +#CHECK: error: invalid use of vector addressing +#CHECK: vllezlf %v0, 0(%v1,%r2) + + vllezlf %v0, -1 + vllezlf %v0, 4096 + vllezlf %v0, 0(%v1,%r2) + +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vlrl %v0, 0(%r0), 0 + + vlrl %v0, 0, -1 + vlrl %v0, 0, 256 + vlrl %v0, -1, 0 + vlrl %v0, 4096, 0 + vlrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vlrlr %v0, %r0, 0(%r0) + + vlrlr %v0, %r0, -1 + vlrlr %v0, %r0, 4096 + vlrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 256, 0 + + vmp %v0, %v0, %v0, 0, -1 + vmp %v0, %v0, %v0, 0, 16 + vmp %v0, %v0, %v0, -1, 0 + vmp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 256, 0 + + vmsp %v0, %v0, %v0, 0, -1 + vmsp %v0, %v0, %v0, 0, 16 + vmsp %v0, %v0, %v0, -1, 0 + vmsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid 
operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 16, 0 + + vmsl %v0, %v0, %v0, %v0, 0, -1 + vmsl %v0, %v0, %v0, %v0, 0, 16 + vmsl %v0, %v0, %v0, %v0, -1, 0 + vmsl %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, 16 + + vmslg %v0, %v0, %v0, %v0, -1 + vmslg %v0, %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vpkz %v0, 0(%r0), 0 + + vpkz %v0, 0, -1 + vpkz %v0, 0, 256 + vpkz %v0, -1, 0 + vpkz %v0, 4096, 0 + vpkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 256, 0, 0 + + vpsop %v0, %v0, 0, 0, -1 + vpsop %v0, %v0, 0, 0, 16 + vpsop %v0, %v0, 0, -1, 0 + vpsop %v0, %v0, 0, 256, 0 + vpsop %v0, %v0, -1, 0, 0 + vpsop %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 256, 0 + + vrp %v0, %v0, %v0, 0, -1 + vrp %v0, %v0, %v0, 0, 16 + vrp %v0, %v0, %v0, -1, 0 + vrp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 
256, 0 + + vsdp %v0, %v0, %v0, 0, -1 + vsdp %v0, %v0, %v0, 0, 16 + vsdp %v0, %v0, %v0, -1, 0 + vsdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 256, 0 + + vsp %v0, %v0, %v0, 0, -1 + vsp %v0, %v0, %v0, 0, 16 + vsp %v0, %v0, %v0, -1, 0 + vsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 256, 0, 0 + + vsrp %v0, %v0, 0, 0, -1 + vsrp %v0, %v0, 0, 0, 16 + vsrp %v0, %v0, 0, -1, 0 + vsrp %v0, %v0, 0, 256, 0 + vsrp %v0, %v0, -1, 0, 0 + vsrp %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vstrl %v0, 0(%r0), 0 + + vstrl %v0, 0, -1 + vstrl %v0, 0, 256 + vstrl %v0, -1, 0 + vstrl %v0, 4096, 0 + vstrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vstrlr %v0, %r0, 0(%r0) + + vstrlr %v0, %r0, -1 + vstrlr %v0, %r0, 4096 + vstrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 4096, 0 +#CHECK: error: %r0 used 
in an address +#CHECK: vupkz %v0, 0(%r0), 0 + + vupkz %v0, 0, -1 + vupkz %v0, 0, 256 + vupkz %v0, -1, 0 + vupkz %v0, 4096, 0 + vupkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 16, 0 + + wfisb %v0, %v0, 0, -1 + wfisb %v0, %v0, 0, 16 + wfisb %v0, %v0, -1, 0 + wfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 16, 0 + + wfixb %v0, %v0, 0, -1 + wfixb %v0, %v0, 0, 16 + wfixb %v0, %v0, -1, 0 + wfixb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 16, 0 + + wflrd %v0, %v0, 0, -1 + wflrd %v0, %v0, 0, 16 + wflrd %v0, %v0, -1, 0 + wflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 16, 0 + + wflrx %v0, %v0, 0, -1 + wflrx %v0, %v0, 0, 16 + wflrx %v0, %v0, -1, 0 + wflrx %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, 16 + + wfmaxdb %v0, %v0, %v0, -1 + wfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, 16 + + wfmaxsb %v0, %v0, %v0, -1 + wfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, 
%v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, %v0, %v0, 16 + + wfmaxxb %v0, %v0, %v0, -1 + wfmaxxb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, 16 + + wfmindb %v0, %v0, %v0, -1 + wfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, 16 + + wfminsb %v0, %v0, %v0, -1 + wfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, 16 + + wfminxb %v0, %v0, %v0, -1 + wfminxb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, 4096 + + wftcisb %v0, %v0, -1 + wftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, 4096 + + wftcixb %v0, %v0, -1 + wftcixb %v0, %v0, 4096 + diff --git a/test/MC/SystemZ/insn-good-z14.s b/test/MC/SystemZ/insn-good-z14.s new file mode 100644 index 000000000000..1fcdcb4ccab0 --- /dev/null +++ b/test/MC/SystemZ/insn-good-z14.s @@ -0,0 +1,2674 @@ +# For z14 and above. 
+# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z14 -show-encoding %s \ +# RUN: | FileCheck %s +# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch12 -show-encoding %s \ +# RUN: | FileCheck %s + +#CHECK: agh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x38] +#CHECK: agh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x38] +#CHECK: agh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x38] +#CHECK: agh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x38] +#CHECK: agh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x38] +#CHECK: agh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x38] +#CHECK: agh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x38] +#CHECK: agh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x38] +#CHECK: agh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x38] +#CHECK: agh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x38] + + agh %r0, -524288 + agh %r0, -1 + agh %r0, 0 + agh %r0, 1 + agh %r0, 524287 + agh %r0, 0(%r1) + agh %r0, 0(%r15) + agh %r0, 524287(%r1,%r15) + agh %r0, 524287(%r15,%r1) + agh %r15, 0 + +#CHECK: bi -524288 # encoding: [0xe3,0xf0,0x00,0x00,0x80,0x47] +#CHECK: bi -1 # encoding: [0xe3,0xf0,0x0f,0xff,0xff,0x47] +#CHECK: bi 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] +#CHECK: bi 1 # encoding: [0xe3,0xf0,0x00,0x01,0x00,0x47] +#CHECK: bi 524287 # encoding: [0xe3,0xf0,0x0f,0xff,0x7f,0x47] +#CHECK: bi 0(%r1) # encoding: [0xe3,0xf0,0x10,0x00,0x00,0x47] +#CHECK: bi 0(%r15) # encoding: [0xe3,0xf0,0xf0,0x00,0x00,0x47] +#CHECK: bi 524287(%r1,%r15) # encoding: [0xe3,0xf1,0xff,0xff,0x7f,0x47] +#CHECK: bi 524287(%r15,%r1) # encoding: [0xe3,0xff,0x1f,0xff,0x7f,0x47] + + bi -524288 + bi -1 + bi 0 + bi 1 + bi 524287 + bi 0(%r1) + bi 0(%r15) + bi 524287(%r1,%r15) + bi 524287(%r15,%r1) + +#CHECK: bic 0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x47] +#CHECK: bic 0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x47] +#CHECK: bic 0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x47] +#CHECK: bic 0, 1 # encoding: 
[0xe3,0x00,0x00,0x01,0x00,0x47] +#CHECK: bic 0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x47] +#CHECK: bic 0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x47] +#CHECK: bic 0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x47] +#CHECK: bic 0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x47] +#CHECK: bic 0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x47] +#CHECK: bic 15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] + + bic 0, -524288 + bic 0, -1 + bic 0, 0 + bic 0, 1 + bic 0, 524287 + bic 0, 0(%r1) + bic 0, 0(%r15) + bic 0, 524287(%r1,%r15) + bic 0, 524287(%r15,%r1) + bic 15, 0 + +#CHECK: bic 1, 0(%r7) # encoding: [0xe3,0x10,0x70,0x00,0x00,0x47] +#CHECK: bio 0(%r15) # encoding: [0xe3,0x10,0xf0,0x00,0x00,0x47] + + bic 1, 0(%r7) + bio 0(%r15) + +#CHECK: bic 2, 0(%r7) # encoding: [0xe3,0x20,0x70,0x00,0x00,0x47] +#CHECK: bih 0(%r15) # encoding: [0xe3,0x20,0xf0,0x00,0x00,0x47] + + bic 2, 0(%r7) + bih 0(%r15) + +#CHECK: bic 3, 0(%r7) # encoding: [0xe3,0x30,0x70,0x00,0x00,0x47] +#CHECK: binle 0(%r15) # encoding: [0xe3,0x30,0xf0,0x00,0x00,0x47] + + bic 3, 0(%r7) + binle 0(%r15) + +#CHECK: bic 4, 0(%r7) # encoding: [0xe3,0x40,0x70,0x00,0x00,0x47] +#CHECK: bil 0(%r15) # encoding: [0xe3,0x40,0xf0,0x00,0x00,0x47] + + bic 4, 0(%r7) + bil 0(%r15) + +#CHECK: bic 5, 0(%r7) # encoding: [0xe3,0x50,0x70,0x00,0x00,0x47] +#CHECK: binhe 0(%r15) # encoding: [0xe3,0x50,0xf0,0x00,0x00,0x47] + + bic 5, 0(%r7) + binhe 0(%r15) + +#CHECK: bic 6, 0(%r7) # encoding: [0xe3,0x60,0x70,0x00,0x00,0x47] +#CHECK: bilh 0(%r15) # encoding: [0xe3,0x60,0xf0,0x00,0x00,0x47] + + bic 6, 0(%r7) + bilh 0(%r15) + +#CHECK: bic 7, 0(%r7) # encoding: [0xe3,0x70,0x70,0x00,0x00,0x47] +#CHECK: bine 0(%r15) # encoding: [0xe3,0x70,0xf0,0x00,0x00,0x47] + + bic 7, 0(%r7) + bine 0(%r15) + +#CHECK: bic 8, 0(%r7) # encoding: [0xe3,0x80,0x70,0x00,0x00,0x47] +#CHECK: bie 0(%r15) # encoding: [0xe3,0x80,0xf0,0x00,0x00,0x47] + + bic 8, 0(%r7) + bie 0(%r15) + +#CHECK: bic 9, 0(%r7) # encoding: 
[0xe3,0x90,0x70,0x00,0x00,0x47] +#CHECK: binlh 0(%r15) # encoding: [0xe3,0x90,0xf0,0x00,0x00,0x47] + + bic 9, 0(%r7) + binlh 0(%r15) + +#CHECK: bic 10, 0(%r7) # encoding: [0xe3,0xa0,0x70,0x00,0x00,0x47] +#CHECK: bihe 0(%r15) # encoding: [0xe3,0xa0,0xf0,0x00,0x00,0x47] + + bic 10, 0(%r7) + bihe 0(%r15) + +#CHECK: bic 11, 0(%r7) # encoding: [0xe3,0xb0,0x70,0x00,0x00,0x47] +#CHECK: binl 0(%r15) # encoding: [0xe3,0xb0,0xf0,0x00,0x00,0x47] + + bic 11, 0(%r7) + binl 0(%r15) + +#CHECK: bic 12, 0(%r7) # encoding: [0xe3,0xc0,0x70,0x00,0x00,0x47] +#CHECK: bile 0(%r15) # encoding: [0xe3,0xc0,0xf0,0x00,0x00,0x47] + + bic 12, 0(%r7) + bile 0(%r15) + +#CHECK: bic 13, 0(%r7) # encoding: [0xe3,0xd0,0x70,0x00,0x00,0x47] +#CHECK: binh 0(%r15) # encoding: [0xe3,0xd0,0xf0,0x00,0x00,0x47] + + bic 13, 0(%r7) + binh 0(%r15) + +#CHECK: bic 14, 0(%r7) # encoding: [0xe3,0xe0,0x70,0x00,0x00,0x47] +#CHECK: bino 0(%r15) # encoding: [0xe3,0xe0,0xf0,0x00,0x00,0x47] + + bic 14, 0(%r7) + bino 0(%r15) + +#CHECK: irbm %r0, %r0 # encoding: [0xb9,0xac,0x00,0x00] +#CHECK: irbm %r0, %r15 # encoding: [0xb9,0xac,0x00,0x0f] +#CHECK: irbm %r15, %r0 # encoding: [0xb9,0xac,0x00,0xf0] +#CHECK: irbm %r7, %r8 # encoding: [0xb9,0xac,0x00,0x78] +#CHECK: irbm %r15, %r15 # encoding: [0xb9,0xac,0x00,0xff] + + irbm %r0,%r0 + irbm %r0,%r15 + irbm %r15,%r0 + irbm %r7,%r8 + irbm %r15,%r15 + +#CHECK: kma %r2, %r2, %r2 # encoding: [0xb9,0x29,0x20,0x22] +#CHECK: kma %r2, %r8, %r14 # encoding: [0xb9,0x29,0x80,0x2e] +#CHECK: kma %r14, %r8, %r2 # encoding: [0xb9,0x29,0x80,0xe2] +#CHECK: kma %r6, %r8, %r10 # encoding: [0xb9,0x29,0x80,0x6a] + + kma %r2, %r2, %r2 + kma %r2, %r8, %r14 + kma %r14, %r8, %r2 + kma %r6, %r8, %r10 + +#CHECK: lgg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4c] +#CHECK: lgg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4c] +#CHECK: lgg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4c] +#CHECK: lgg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4c] +#CHECK: lgg %r0, 524287 # encoding: 
[0xe3,0x00,0x0f,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4c] +#CHECK: lgg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4c] +#CHECK: lgg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4c] +#CHECK: lgg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x4c] + + lgg %r0, -524288 + lgg %r0, -1 + lgg %r0, 0 + lgg %r0, 1 + lgg %r0, 524287 + lgg %r0, 0(%r1) + lgg %r0, 0(%r15) + lgg %r0, 524287(%r1,%r15) + lgg %r0, 524287(%r15,%r1) + lgg %r15, 0 + +#CHECK: lgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4d] +#CHECK: lgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4d] +#CHECK: lgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4d] +#CHECK: lgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4d] + + lgsc %r0, -524288 + lgsc %r0, -1 + lgsc %r0, 0 + lgsc %r0, 1 + lgsc %r0, 524287 + lgsc %r0, 0(%r1) + lgsc %r0, 0(%r15) + lgsc %r0, 524287(%r1,%r15) + lgsc %r0, 524287(%r15,%r1) + +#CHECK: llgfsg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x48] +#CHECK: llgfsg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x48] +#CHECK: llgfsg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x48] +#CHECK: llgfsg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 
524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x48] +#CHECK: llgfsg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x48] + + llgfsg %r0, -524288 + llgfsg %r0, -1 + llgfsg %r0, 0 + llgfsg %r0, 1 + llgfsg %r0, 524287 + llgfsg %r0, 0(%r1) + llgfsg %r0, 0(%r15) + llgfsg %r0, 524287(%r1,%r15) + llgfsg %r0, 524287(%r15,%r1) + llgfsg %r15, 0 + +#CHECK: mg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x84] +#CHECK: mg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x84] +#CHECK: mg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x84] +#CHECK: mg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x84] +#CHECK: mg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x84] +#CHECK: mg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x84] +#CHECK: mg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x84] +#CHECK: mg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x84] +#CHECK: mg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x84] +#CHECK: mg %r14, 0 # encoding: [0xe3,0xe0,0x00,0x00,0x00,0x84] + + mg %r0, -524288 + mg %r0, -1 + mg %r0, 0 + mg %r0, 1 + mg %r0, 524287 + mg %r0, 0(%r1) + mg %r0, 0(%r15) + mg %r0, 524287(%r1,%r15) + mg %r0, 524287(%r15,%r1) + mg %r14, 0 + +#CHECK: mgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3c] +#CHECK: mgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3c] +#CHECK: mgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3c] +#CHECK: mgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3c] +#CHECK: mgh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3c] +#CHECK: mgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: mgh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3c] +#CHECK: mgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3c] + + mgh %r0, -524288 + mgh %r0, -1 + mgh %r0, 0 + mgh %r0, 1 + mgh %r0, 524287 + mgh %r0, 0(%r1) 
+ mgh %r0, 0(%r15) + mgh %r0, 524287(%r1,%r15) + mgh %r0, 524287(%r15,%r1) + mgh %r15, 0 + +#CHECK: mgrk %r0, %r0, %r0 # encoding: [0xb9,0xec,0x00,0x00] +#CHECK: mgrk %r0, %r0, %r15 # encoding: [0xb9,0xec,0xf0,0x00] +#CHECK: mgrk %r0, %r15, %r0 # encoding: [0xb9,0xec,0x00,0x0f] +#CHECK: mgrk %r14, %r0, %r0 # encoding: [0xb9,0xec,0x00,0xe0] +#CHECK: mgrk %r6, %r8, %r9 # encoding: [0xb9,0xec,0x90,0x68] + + mgrk %r0,%r0,%r0 + mgrk %r0,%r0,%r15 + mgrk %r0,%r15,%r0 + mgrk %r14,%r0,%r0 + mgrk %r6,%r8,%r9 + +#CHECK: msc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x53] +#CHECK: msc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x53] +#CHECK: msc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x53] +#CHECK: msc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x53] +#CHECK: msc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x53] +#CHECK: msc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x53] +#CHECK: msc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x53] +#CHECK: msc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x53] +#CHECK: msc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x53] +#CHECK: msc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x53] + + msc %r0, -524288 + msc %r0, -1 + msc %r0, 0 + msc %r0, 1 + msc %r0, 524287 + msc %r0, 0(%r1) + msc %r0, 0(%r15) + msc %r0, 524287(%r1,%r15) + msc %r0, 524287(%r15,%r1) + msc %r15, 0 + +#CHECK: msgc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x83] +#CHECK: msgc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x83] +#CHECK: msgc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x83] +#CHECK: msgc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x83] +#CHECK: msgc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x83] +#CHECK: msgc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x83] +#CHECK: msgc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x83] +#CHECK: msgc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x83] +#CHECK: msgc %r0, 524287(%r15,%r1) # encoding: 
[0xe3,0x0f,0x1f,0xff,0x7f,0x83] +#CHECK: msgc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x83] + + msgc %r0, -524288 + msgc %r0, -1 + msgc %r0, 0 + msgc %r0, 1 + msgc %r0, 524287 + msgc %r0, 0(%r1) + msgc %r0, 0(%r15) + msgc %r0, 524287(%r1,%r15) + msgc %r0, 524287(%r15,%r1) + msgc %r15, 0 + +#CHECK: msrkc %r0, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0x00] +#CHECK: msrkc %r0, %r0, %r15 # encoding: [0xb9,0xfd,0xf0,0x00] +#CHECK: msrkc %r0, %r15, %r0 # encoding: [0xb9,0xfd,0x00,0x0f] +#CHECK: msrkc %r15, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0xf0] +#CHECK: msrkc %r7, %r8, %r9 # encoding: [0xb9,0xfd,0x90,0x78] + + msrkc %r0,%r0,%r0 + msrkc %r0,%r0,%r15 + msrkc %r0,%r15,%r0 + msrkc %r15,%r0,%r0 + msrkc %r7,%r8,%r9 + +#CHECK: msgrkc %r0, %r0, %r0 # encoding: [0xb9,0xed,0x00,0x00] +#CHECK: msgrkc %r0, %r0, %r15 # encoding: [0xb9,0xed,0xf0,0x00] +#CHECK: msgrkc %r0, %r15, %r0 # encoding: [0xb9,0xed,0x00,0x0f] +#CHECK: msgrkc %r15, %r0, %r0 # encoding: [0xb9,0xed,0x00,0xf0] +#CHECK: msgrkc %r7, %r8, %r9 # encoding: [0xb9,0xed,0x90,0x78] + + msgrkc %r0,%r0,%r0 + msgrkc %r0,%r0,%r15 + msgrkc %r0,%r15,%r0 + msgrkc %r15,%r0,%r0 + msgrkc %r7,%r8,%r9 + +#CHECK: prno %r2, %r2 # encoding: [0xb9,0x3c,0x00,0x22] +#CHECK: prno %r2, %r14 # encoding: [0xb9,0x3c,0x00,0x2e] +#CHECK: prno %r14, %r2 # encoding: [0xb9,0x3c,0x00,0xe2] +#CHECK: prno %r6, %r10 # encoding: [0xb9,0x3c,0x00,0x6a] + + prno %r2, %r2 + prno %r2, %r14 + prno %r14, %r2 + prno %r6, %r10 + +#CHECK: sgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x39] +#CHECK: sgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x39] +#CHECK: sgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x39] +#CHECK: sgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x39] +#CHECK: sgh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x39] +#CHECK: sgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x39] +#CHECK: sgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x39] +#CHECK: sgh %r0, 524287(%r1,%r15) # encoding: 
[0xe3,0x01,0xff,0xff,0x7f,0x39] +#CHECK: sgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x39] +#CHECK: sgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x39] + + sgh %r0, -524288 + sgh %r0, -1 + sgh %r0, 0 + sgh %r0, 1 + sgh %r0, 524287 + sgh %r0, 0(%r1) + sgh %r0, 0(%r15) + sgh %r0, 524287(%r1,%r15) + sgh %r0, 524287(%r15,%r1) + sgh %r15, 0 + +#CHECK: stgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x49] +#CHECK: stgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x49] +#CHECK: stgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x49] +#CHECK: stgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x49] +#CHECK: stgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x49] +#CHECK: stgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x49] +#CHECK: stgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x49] + + stgsc %r0, -524288 + stgsc %r0, -1 + stgsc %r0, 0 + stgsc %r0, 1 + stgsc %r0, 524287 + stgsc %r0, 0(%r1) + stgsc %r0, 0(%r15) + stgsc %r0, 524287(%r1,%r15) + stgsc %r0, 524287(%r15,%r1) + +#CHECK: vap %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x71] +#CHECK: vap %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x71] +#CHECK: vap %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x71] +#CHECK: vap %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x71] +#CHECK: vap %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x71] + + vap %v0, %v0, %v0, 0, 0 + vap %v0, %v0, %v0, 0, 15 + vap %v0, %v0, %v0, 255, 0 + vap %v0, %v0, %v31, 0, 0 + vap %v0, %v31, %v0, 0, 0 + vap %v31, %v0, %v0, 0, 0 + vap %v13, %v17, %v21, 0x79, 11 + +#CHECK: vbperm %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x85] +#CHECK: vbperm %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x85] +#CHECK: vbperm %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x85] +#CHECK: vbperm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x85] + + vbperm %v0, %v0, %v0 + vbperm %v0, %v0, %v15 + vbperm %v0, %v0, %v31 + vbperm %v0, %v15, %v0 + vbperm %v0, %v31, %v0 + vbperm %v15, %v0, %v0 + vbperm %v31, %v0, %v0 + vbperm %v18, %v3, %v20 + +#CHECK: vcp %v0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x77] +#CHECK: vcp %v0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x77] +#CHECK: vcp %v15, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x77] +#CHECK: vcp %v31, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x77] +#CHECK: vcp %v0, %v15, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x77] +#CHECK: vcp %v0, %v31, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x77] +#CHECK: vcp %v3, %v18, 4 # encoding: [0xe6,0x03,0x20,0x40,0x02,0x77] + + vcp %v0, %v0, 0 + vcp %v0, %v0, 15 + vcp %v15, %v0, 0 + vcp %v31, %v0, 0 + vcp %v0, %v15, 0 + vcp %v0, %v31, 0 + vcp %v3, %v18, 4 + +#CHECK: vcvb %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x50] +#CHECK: vcvb %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vcvb %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x50] + + vcvb %r0, %v0, 0 + vcvb %r0, %v0, 15 + vcvb %r15, %v0, 0 + vcvb %r0, %v15, 0 + vcvb %r0, %v31, 0 + vcvb %r3, %v18, 4 + +#CHECK: vcvbg %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x52] 
+#CHECK: vcvbg %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x52] +#CHECK: vcvbg %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x52] +#CHECK: vcvbg %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x52] + + vcvbg %r0, %v0, 0 + vcvbg %r0, %v0, 15 + vcvbg %r15, %v0, 0 + vcvbg %r0, %v15, 0 + vcvbg %r0, %v31, 0 + vcvbg %r3, %v18, 4 + +#CHECK: vcvd %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x58] +#CHECK: vcvd %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x58] +#CHECK: vcvd %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x58] +#CHECK: vcvd %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x58] + + vcvd %v0, %r0, 0, 0 + vcvd %v0, %r0, 0, 15 + vcvd %v0, %r0, 255, 0 + vcvd %v0, %r15, 0, 0 + vcvd %v15, %r0, 0, 0 + vcvd %v31, %r0, 0, 0 + vcvd %v18, %r9, 0x34, 11 + +#CHECK: vcvdg %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5a] +#CHECK: vcvdg %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5a] +#CHECK: vcvdg %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x5a] + + vcvdg %v0, %r0, 0, 0 + vcvdg %v0, %r0, 0, 15 + vcvdg %v0, %r0, 255, 0 + vcvdg %v0, %r15, 0, 0 + vcvdg %v15, %r0, 0, 0 + vcvdg %v31, %r0, 0, 0 + vcvdg %v18, %r9, 0x34, 11 + +#CHECK: vdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 0, 15 # 
encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7a] +#CHECK: vdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7a] +#CHECK: vdp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7a] +#CHECK: vdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7a] +#CHECK: vdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7a] + + vdp %v0, %v0, %v0, 0, 0 + vdp %v0, %v0, %v0, 0, 15 + vdp %v0, %v0, %v0, 255, 0 + vdp %v0, %v0, %v31, 0, 0 + vdp %v0, %v31, %v0, 0, 0 + vdp %v31, %v0, %v0, 0, 0 + vdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vfasb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe3] +#CHECK: vfasb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe3] +#CHECK: vfasb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe3] +#CHECK: vfasb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe3] +#CHECK: vfasb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe3] + + vfasb %v0, %v0, %v0 + vfasb %v0, %v0, %v31 + vfasb %v0, %v31, %v0 + vfasb %v31, %v0, %v0 + vfasb %v18, %v3, %v20 + +#CHECK: vfcesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe8] +#CHECK: vfcesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe8] +#CHECK: vfcesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe8] +#CHECK: vfcesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe8] +#CHECK: vfcesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe8] + + vfcesb %v0, %v0, %v0 + vfcesb %v0, %v0, %v31 + vfcesb %v0, %v31, %v0 + vfcesb %v31, %v0, %v0 + vfcesb %v18, %v3, %v20 + +#CHECK: vfcesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xe8] +#CHECK: vfcesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xe8] +#CHECK: vfcesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xe8] +#CHECK: vfcesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xe8] +#CHECK: vfcesbs %v18, %v3, %v20 # encoding: 
[0xe7,0x23,0x40,0x10,0x2a,0xe8] + + vfcesbs %v0, %v0, %v0 + vfcesbs %v0, %v0, %v31 + vfcesbs %v0, %v31, %v0 + vfcesbs %v31, %v0, %v0 + vfcesbs %v18, %v3, %v20 + +#CHECK: vfchsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xeb] +#CHECK: vfchsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xeb] +#CHECK: vfchsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xeb] +#CHECK: vfchsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xeb] +#CHECK: vfchsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xeb] + + vfchsb %v0, %v0, %v0 + vfchsb %v0, %v0, %v31 + vfchsb %v0, %v31, %v0 + vfchsb %v31, %v0, %v0 + vfchsb %v18, %v3, %v20 + +#CHECK: vfchsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xeb] +#CHECK: vfchsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xeb] +#CHECK: vfchsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xeb] +#CHECK: vfchsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xeb] +#CHECK: vfchsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xeb] + + vfchsbs %v0, %v0, %v0 + vfchsbs %v0, %v0, %v31 + vfchsbs %v0, %v31, %v0 + vfchsbs %v31, %v0, %v0 + vfchsbs %v18, %v3, %v20 + +#CHECK: vfchesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xea] +#CHECK: vfchesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xea] +#CHECK: vfchesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xea] +#CHECK: vfchesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xea] +#CHECK: vfchesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xea] + + vfchesb %v0, %v0, %v0 + vfchesb %v0, %v0, %v31 + vfchesb %v0, %v31, %v0 + vfchesb %v31, %v0, %v0 + vfchesb %v18, %v3, %v20 + +#CHECK: vfchesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xea] +#CHECK: vfchesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xea] +#CHECK: vfchesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xea] +#CHECK: vfchesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xea] +#CHECK: 
vfchesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xea] + + vfchesbs %v0, %v0, %v0 + vfchesbs %v0, %v0, %v31 + vfchesbs %v0, %v31, %v0 + vfchesbs %v31, %v0, %v0 + vfchesbs %v18, %v3, %v20 + +#CHECK: vfdsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe5] +#CHECK: vfdsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe5] +#CHECK: vfdsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe5] +#CHECK: vfdsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe5] +#CHECK: vfdsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe5] + + vfdsb %v0, %v0, %v0 + vfdsb %v0, %v0, %v31 + vfdsb %v0, %v31, %v0 + vfdsb %v31, %v0, %v0 + vfdsb %v18, %v3, %v20 + +#CHECK: vfisb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x07,0x20,0xc7] +#CHECK: vfisb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc7] +#CHECK: vfisb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc7] +#CHECK: vfisb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x24,0xc7] + + vfisb %v0, %v0, 0, 0 + vfisb %v0, %v0, 0, 15 + vfisb %v0, %v0, 4, 0 + vfisb %v0, %v0, 7, 0 + vfisb %v0, %v31, 0, 0 + vfisb %v31, %v0, 0, 0 + vfisb %v14, %v17, 4, 10 + +#CHECK: vfkedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xe8] +#CHECK: vfkedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xe8] +#CHECK: vfkedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xe8] +#CHECK: vfkedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xe8] +#CHECK: vfkedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xe8] + + vfkedb %v0, %v0, %v0 + vfkedb %v0, %v0, %v31 + vfkedb %v0, %v31, %v0 + vfkedb %v31, %v0, %v0 + vfkedb %v18, %v3, %v20 + +#CHECK: vfkedbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xe8] +#CHECK: vfkedbs %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0xf0,0x14,0x32,0xe8] +#CHECK: vfkedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xe8] +#CHECK: vfkedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xe8] +#CHECK: vfkedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xe8] + + vfkedbs %v0, %v0, %v0 + vfkedbs %v0, %v0, %v31 + vfkedbs %v0, %v31, %v0 + vfkedbs %v31, %v0, %v0 + vfkedbs %v18, %v3, %v20 + +#CHECK: vfkesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xe8] +#CHECK: vfkesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xe8] +#CHECK: vfkesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xe8] +#CHECK: vfkesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xe8] +#CHECK: vfkesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xe8] + + vfkesb %v0, %v0, %v0 + vfkesb %v0, %v0, %v31 + vfkesb %v0, %v31, %v0 + vfkesb %v31, %v0, %v0 + vfkesb %v18, %v3, %v20 + +#CHECK: vfkesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xe8] +#CHECK: vfkesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xe8] +#CHECK: vfkesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xe8] +#CHECK: vfkesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xe8] +#CHECK: vfkesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xe8] + + vfkesbs %v0, %v0, %v0 + vfkesbs %v0, %v0, %v31 + vfkesbs %v0, %v31, %v0 + vfkesbs %v31, %v0, %v0 + vfkesbs %v18, %v3, %v20 + +#CHECK: vfkhdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xeb] +#CHECK: vfkhdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xeb] +#CHECK: vfkhdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xeb] +#CHECK: vfkhdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xeb] +#CHECK: vfkhdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xeb] + + vfkhdb %v0, %v0, %v0 + vfkhdb %v0, %v0, %v31 + vfkhdb %v0, %v31, %v0 + vfkhdb %v31, %v0, %v0 + vfkhdb %v18, %v3, %v20 + +#CHECK: vfkhdbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xeb] +#CHECK: vfkhdbs 
%v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xeb] +#CHECK: vfkhdbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xeb] +#CHECK: vfkhdbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xeb] +#CHECK: vfkhdbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xeb] + + vfkhdbs %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v31 + vfkhdbs %v0, %v31, %v0 + vfkhdbs %v31, %v0, %v0 + vfkhdbs %v18, %v3, %v20 + +#CHECK: vfkhsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xeb] +#CHECK: vfkhsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xeb] +#CHECK: vfkhsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xeb] +#CHECK: vfkhsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xeb] +#CHECK: vfkhsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xeb] + + vfkhsb %v0, %v0, %v0 + vfkhsb %v0, %v0, %v31 + vfkhsb %v0, %v31, %v0 + vfkhsb %v31, %v0, %v0 + vfkhsb %v18, %v3, %v20 + +#CHECK: vfkhsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xeb] +#CHECK: vfkhsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xeb] +#CHECK: vfkhsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xeb] +#CHECK: vfkhsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xeb] +#CHECK: vfkhsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xeb] + + vfkhsbs %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v31 + vfkhsbs %v0, %v31, %v0 + vfkhsbs %v31, %v0, %v0 + vfkhsbs %v18, %v3, %v20 + +#CHECK: vfkhedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xea] +#CHECK: vfkhedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xea] +#CHECK: vfkhedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xea] +#CHECK: vfkhedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xea] +#CHECK: vfkhedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xea] + + vfkhedb %v0, %v0, %v0 + vfkhedb %v0, %v0, %v31 + vfkhedb %v0, %v31, %v0 + vfkhedb %v31, %v0, %v0 + vfkhedb %v18, %v3, %v20 + +#CHECK: vfkhedbs %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x14,0x30,0xea] +#CHECK: vfkhedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xea] +#CHECK: vfkhedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xea] +#CHECK: vfkhedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xea] +#CHECK: vfkhedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xea] + + vfkhedbs %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v31 + vfkhedbs %v0, %v31, %v0 + vfkhedbs %v31, %v0, %v0 + vfkhedbs %v18, %v3, %v20 + +#CHECK: vfkhesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xea] +#CHECK: vfkhesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xea] +#CHECK: vfkhesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xea] +#CHECK: vfkhesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xea] +#CHECK: vfkhesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xea] + + vfkhesb %v0, %v0, %v0 + vfkhesb %v0, %v0, %v31 + vfkhesb %v0, %v31, %v0 + vfkhesb %v31, %v0, %v0 + vfkhesb %v18, %v3, %v20 + +#CHECK: vfkhesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xea] +#CHECK: vfkhesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xea] +#CHECK: vfkhesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xea] +#CHECK: vfkhesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xea] +#CHECK: vfkhesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xea] + + vfkhesbs %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v31 + vfkhesbs %v0, %v31, %v0 + vfkhesbs %v31, %v0, %v0 + vfkhesbs %v18, %v3, %v20 + +#CHECK: vfpsosb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xcc] +#CHECK: vfpsosb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x24,0xcc] +#CHECK: vfpsosb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x28,0xcc] +#CHECK: vfpsosb %v14, %v17, 7 # encoding: 
[0xe7,0xe1,0x00,0x70,0x24,0xcc] + + vfpsosb %v0, %v0, 3 + vfpsosb %v0, %v0, 15 + vfpsosb %v0, %v15, 3 + vfpsosb %v0, %v31, 3 + vfpsosb %v15, %v0, 3 + vfpsosb %v31, %v0, 3 + vfpsosb %v14, %v17, 7 + +#CHECK: vflcsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcc] +#CHECK: vflcsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xcc] +#CHECK: vflcsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcc] + + vflcsb %v0, %v0 + vflcsb %v0, %v15 + vflcsb %v0, %v31 + vflcsb %v15, %v0 + vflcsb %v31, %v0 + vflcsb %v14, %v17 + +#CHECK: vflnsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xcc] +#CHECK: vflnsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xcc] +#CHECK: vflnsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x10,0x24,0xcc] + + vflnsb %v0, %v0 + vflnsb %v0, %v15 + vflnsb %v0, %v31 + vflnsb %v15, %v0 + vflnsb %v31, %v0 + vflnsb %v14, %v17 + +#CHECK: vflpsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x20,0x24,0xcc] +#CHECK: vflpsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x28,0xcc] +#CHECK: vflpsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x20,0x24,0xcc] + + vflpsb %v0, %v0 + vflpsb %v0, %v15 + vflpsb %v0, %v31 + vflpsb %v15, %v0 + vflpsb %v31, %v0 + vflpsb %v14, %v17 + +#CHECK: vfll %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc4] +#CHECK: vfll %v0, %v0, 0, 15 # encoding: 
[0xe7,0x00,0x00,0x0f,0x00,0xc4] +#CHECK: vfll %v0, %v15, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc4] +#CHECK: vfll %v15, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc4] +#CHECK: vfll %v14, %v17, 11, 9 # encoding: [0xe7,0xe1,0x00,0x09,0xb4,0xc4] + + vfll %v0, %v0, 0, 0 + vfll %v0, %v0, 15, 0 + vfll %v0, %v0, 0, 15 + vfll %v0, %v15, 0, 0 + vfll %v0, %v31, 0, 0 + vfll %v15, %v0, 0, 0 + vfll %v31, %v0, 0, 0 + vfll %v14, %v17, 11, 9 + +#CHECK: vflls %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc4] +#CHECK: vflls %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc4] +#CHECK: vflls %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xc4] + + vflls %v0, %v0 + vflls %v0, %v15 + vflls %v0, %v31 + vflls %v15, %v0 + vflls %v31, %v0 + vflls %v14, %v17 + +#CHECK: vflr %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc5] +#CHECK: vflr %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc5] +#CHECK: vflr %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x00,0xc5] +#CHECK: vflr %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc5] +#CHECK: vflr %v31, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc5] +#CHECK: vflr %v14, %v17, 11, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0xb4,0xc5] + + vflr %v0, %v0, 0, 0, 0 + vflr %v0, %v0, 15, 0, 0 + vflr %v0, %v0, 0, 0, 15 + vflr %v0, %v0, 0, 4, 0 + vflr %v0, %v0, 0, 12, 0 + vflr %v0, %v31, 0, 0, 0 + vflr %v31, %v0, 0, 0, 0 + vflr %v14, %v17, 11, 4, 10 + +#CHECK: vflrd %v0, %v0, 0, 0 # encoding: 
[0xe7,0x00,0x00,0x00,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: vflrd %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc5] +#CHECK: vflrd %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc5] +#CHECK: vflrd %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc5] + + vflrd %v0, %v0, 0, 0 + vflrd %v0, %v0, 0, 15 + vflrd %v0, %v0, 4, 0 + vflrd %v0, %v0, 12, 0 + vflrd %v0, %v31, 0, 0 + vflrd %v31, %v0, 0, 0 + vflrd %v14, %v17, 4, 10 + +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xef] +#CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xef] +#CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xef] +#CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xef] + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmax %v0, %v0, %v0, 15, 0, 0 + vfmax %v0, %v0, %v0, 0, 15, 0 + vfmax %v0, %v0, %v0, 0, 0, 4 + vfmax %v0, %v0, %v31, 0, 0, 0 + vfmax %v0, %v31, %v0, 0, 0, 0 + vfmax %v31, %v0, %v0, 0, 0, 0 + vfmax %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmaxdb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xef] +#CHECK: vfmaxdb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xef] +#CHECK: vfmaxdb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xef] +#CHECK: vfmaxdb %v18, 
%v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xef] + + vfmaxdb %v0, %v0, %v0, 0 + vfmaxdb %v0, %v0, %v0, 4 + vfmaxdb %v0, %v0, %v31, 0 + vfmaxdb %v0, %v31, %v0, 0 + vfmaxdb %v31, %v0, %v0, 0 + vfmaxdb %v18, %v3, %v20, 12 + +#CHECK: vfmaxsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xef] +#CHECK: vfmaxsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xef] +#CHECK: vfmaxsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xef] +#CHECK: vfmaxsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xef] + + vfmaxsb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 4 + vfmaxsb %v0, %v0, %v31, 0 + vfmaxsb %v0, %v31, %v0, 0 + vfmaxsb %v31, %v0, %v0, 0 + vfmaxsb %v18, %v3, %v20, 12 + +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xee] +#CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xee] +#CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xee] +#CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xee] + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmin %v0, %v0, %v0, 15, 0, 0 + vfmin %v0, %v0, %v0, 0, 15, 0 + vfmin %v0, %v0, %v0, 0, 0, 4 + vfmin %v0, %v0, %v31, 0, 0, 0 + vfmin %v0, %v31, %v0, 0, 0, 0 + vfmin %v31, %v0, %v0, 0, 0, 0 + vfmin %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmindb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v31, 0 # 
encoding: [0xe7,0x00,0xf0,0x00,0x32,0xee] +#CHECK: vfmindb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xee] +#CHECK: vfmindb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xee] +#CHECK: vfmindb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xee] + + vfmindb %v0, %v0, %v0, 0 + vfmindb %v0, %v0, %v0, 4 + vfmindb %v0, %v0, %v31, 0 + vfmindb %v0, %v31, %v0, 0 + vfmindb %v31, %v0, %v0, 0 + vfmindb %v18, %v3, %v20, 12 + +#CHECK: vfminsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xee] +#CHECK: vfminsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xee] +#CHECK: vfminsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xee] +#CHECK: vfminsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xee] + + vfminsb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 4 + vfminsb %v0, %v0, %v31, 0 + vfminsb %v0, %v31, %v0, 0 + vfminsb %v31, %v0, %v0, 0 + vfminsb %v18, %v3, %v20, 12 + +#CHECK: vfmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8f] +#CHECK: vfmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8f] +#CHECK: vfmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8f] +#CHECK: vfmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8f] +#CHECK: vfmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8f] +#CHECK: vfmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8f] + + vfmasb %v0, %v0, %v0, %v0 + vfmasb %v0, %v0, %v0, %v31 + vfmasb %v0, %v0, %v31, %v0 + vfmasb %v0, %v31, %v0, %v0 + vfmasb %v31, %v0, %v0, %v0 + vfmasb %v13, %v17, %v21, %v25 + +#CHECK: vfmsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe7] +#CHECK: vfmsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe7] +#CHECK: vfmsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe7] +#CHECK: vfmsb 
%v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe7] +#CHECK: vfmsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe7] + + vfmsb %v0, %v0, %v0 + vfmsb %v0, %v0, %v31 + vfmsb %v0, %v31, %v0 + vfmsb %v31, %v0, %v0 + vfmsb %v18, %v3, %v20 + +#CHECK: vfmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8e] +#CHECK: vfmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8e] +#CHECK: vfmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8e] +#CHECK: vfmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8e] +#CHECK: vfmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8e] +#CHECK: vfmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8e] + + vfmssb %v0, %v0, %v0, %v0 + vfmssb %v0, %v0, %v0, %v31 + vfmssb %v0, %v0, %v31, %v0 + vfmssb %v0, %v31, %v0, %v0 + vfmssb %v31, %v0, %v0, %v0 + vfmssb %v13, %v17, %v21, %v25 + +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9f] +#CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9f] +#CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9f] +#CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9f] +#CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9f] + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnma %v0, %v0, %v0, %v0, 0, 15 + vfnma %v0, %v0, %v0, %v0, 15, 0 + vfnma %v0, %v0, %v0, %v31, 0, 0 + vfnma %v0, %v0, %v31, %v0, 0, 0 + vfnma %v0, %v31, %v0, %v0, 0, 0 + vfnma %v31, %v0, %v0, %v0, 0, 0 + vfnma %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmadb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9f] +#CHECK: vfnmadb %v0, %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9f] +#CHECK: vfnmadb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9f] +#CHECK: vfnmadb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9f] +#CHECK: vfnmadb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9f] +#CHECK: vfnmadb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9f] + + vfnmadb %v0, %v0, %v0, %v0 + vfnmadb %v0, %v0, %v0, %v31 + vfnmadb %v0, %v0, %v31, %v0 + vfnmadb %v0, %v31, %v0, %v0 + vfnmadb %v31, %v0, %v0, %v0 + vfnmadb %v13, %v17, %v21, %v25 + +#CHECK: vfnmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9f] +#CHECK: vfnmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9f] +#CHECK: vfnmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9f] +#CHECK: vfnmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9f] +#CHECK: vfnmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9f] +#CHECK: vfnmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9f] + + vfnmasb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v31 + vfnmasb %v0, %v0, %v31, %v0 + vfnmasb %v0, %v31, %v0, %v0 + vfnmasb %v31, %v0, %v0, %v0 + vfnmasb %v13, %v17, %v21, %v25 + +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9e] +#CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9e] +#CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9e] +#CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9e] +#CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9e] + + vfnms %v0, %v0, %v0, %v0, 0, 0 + vfnms %v0, %v0, %v0, %v0, 0, 15 + vfnms %v0, %v0, 
%v0, %v0, 15, 0 + vfnms %v0, %v0, %v0, %v31, 0, 0 + vfnms %v0, %v0, %v31, %v0, 0, 0 + vfnms %v0, %v31, %v0, %v0, 0, 0 + vfnms %v31, %v0, %v0, %v0, 0, 0 + vfnms %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9e] +#CHECK: vfnmsdb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9e] +#CHECK: vfnmsdb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9e] +#CHECK: vfnmsdb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9e] + + vfnmsdb %v0, %v0, %v0, %v0 + vfnmsdb %v0, %v0, %v0, %v31 + vfnmsdb %v0, %v0, %v31, %v0 + vfnmsdb %v0, %v31, %v0, %v0 + vfnmsdb %v31, %v0, %v0, %v0 + vfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: vfnmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9e] +#CHECK: vfnmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9e] +#CHECK: vfnmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9e] +#CHECK: vfnmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9e] +#CHECK: vfnmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9e] +#CHECK: vfnmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9e] + + vfnmssb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v31 + vfnmssb %v0, %v0, %v31, %v0 + vfnmssb %v0, %v31, %v0, %v0 + vfnmssb %v31, %v0, %v0, %v0 + vfnmssb %v13, %v17, %v21, %v25 + +#CHECK: vfssb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe2] +#CHECK: vfssb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe2] +#CHECK: vfssb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe2] +#CHECK: vfssb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe2] +#CHECK: vfssb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe2] + + vfssb %v0, %v0, %v0 + vfssb %v0, %v0, %v31 + vfssb %v0, %v31, %v0 + vfssb %v31, %v0, 
%v0 + vfssb %v18, %v3, %v20 + +#CHECK: vfsqsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xce] +#CHECK: vfsqsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xce] +#CHECK: vfsqsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xce] + + vfsqsb %v0, %v0 + vfsqsb %v0, %v15 + vfsqsb %v0, %v31 + vfsqsb %v15, %v0 + vfsqsb %v31, %v0 + vfsqsb %v14, %v17 + +#CHECK: vftcisb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf0,0x20,0x4a] +#CHECK: vftcisb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x4a] +#CHECK: vftcisb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x4a] +#CHECK: vftcisb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x80,0x24,0x4a] + + vftcisb %v0, %v0, 0 + vftcisb %v0, %v0, 4095 + vftcisb %v0, %v15, 0 + vftcisb %v0, %v31, 0 + vftcisb %v15, %v0, 0 + vftcisb %v31, %v0, 0 + vftcisb %v4, %v21, 0x678 + +#CHECK: vlip %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x49] +#CHECK: vlip %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x49] +#CHECK: vlip %v0, 65535, 0 # encoding: [0xe6,0x00,0xff,0xff,0x00,0x49] +#CHECK: vlip %v15, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x49] +#CHECK: vlip %v31, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x49] +#CHECK: vlip %v17, 4660, 7 # encoding: [0xe6,0x10,0x12,0x34,0x78,0x49] + + vlip %v0, 0, 0 + vlip %v0, 0, 15 + vlip %v0, 0xffff, 0 + vlip %v15, 0, 0 + vlip %v31, 0, 0 + vlip %v17, 0x1234, 7 + +#CHECK: vllezlf %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15) # encoding: 
[0xe7,0x00,0xf0,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x60,0x04] +#CHECK: vllezlf %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x68,0x04] +#CHECK: vllezlf %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x68,0x04] + + vllezlf %v0, 0 + vllezlf %v0, 4095 + vllezlf %v0, 0(%r15) + vllezlf %v0, 0(%r15,%r1) + vllezlf %v15, 0 + vllezlf %v31, 0 + vllezlf %v18, 0x567(%r3,%r4) + +#CHECK: vlrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x35] +#CHECK: vlrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x35] +#CHECK: vlrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x35] +#CHECK: vlrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x35] +#CHECK: vlrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x35] + + vlrl %v0, 0, 0 + vlrl %v0, 4095, 0 + vlrl %v0, 0(%r15), 0 + vlrl %v0, 0, 255 + vlrl %v15, 0, 0 + vlrl %v31, 0, 0 + vlrl %v18, 1383(%r4), 3 + +#CHECK: vlrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x37] +#CHECK: vlrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x37] +#CHECK: vlrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x37] + + vlrlr %v0, %r0, 0 + vlrlr %v0, %r0, 4095 + vlrlr %v0, %r0, 0(%r15) + vlrlr %v0, %r15, 0 + vlrlr %v15, %r0, 0 + vlrlr %v31, %r0, 0 + vlrlr %v18, %r3, 1383(%r4) + +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0xb8] +#CHECK: vmsl %v0, 
%v0, %v0, %v0, 0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xb8] +#CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb8] +#CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb8] +#CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 # encoding: [0xe7,0x23,0x40,0x40,0x5a,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 # encoding: [0xe7,0x23,0x4b,0x80,0x5a,0xb8] + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmsl %v0, %v0, %v0, %v0, 15, 0 + vmsl %v0, %v0, %v0, %v0, 0, 12 + vmsl %v0, %v0, %v0, %v15, 0, 0 + vmsl %v0, %v0, %v0, %v31, 0, 0 + vmsl %v0, %v0, %v15, %v0, 0, 0 + vmsl %v0, %v0, %v31, %v0, 0, 0 + vmsl %v0, %v15, %v0, %v0, 0, 0 + vmsl %v0, %v31, %v0, %v0, 0, 0 + vmsl %v15, %v0, %v0, %v0, 0, 0 + vmsl %v31, %v0, %v0, %v0, 0, 0 + vmsl %v18, %v3, %v20, %v5, 0, 4 + vmsl %v18, %v3, %v20, %v5, 11, 8 + +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x03,0xc0,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf0,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0xb8] +#CHECK: vmslg %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0xb8] +#CHECK: vmslg %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0xb8] 
+#CHECK: vmslg %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x43,0x40,0x5a,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x43,0x80,0x5a,0xb8] + + vmslg %v0, %v0, %v0, %v0, 0 + vmslg %v0, %v0, %v0, %v0, 12 + vmslg %v0, %v0, %v0, %v15, 0 + vmslg %v0, %v0, %v0, %v31, 0 + vmslg %v0, %v0, %v15, %v0, 0 + vmslg %v0, %v0, %v31, %v0, 0 + vmslg %v0, %v15, %v0, %v0, 0 + vmslg %v0, %v31, %v0, %v0, 0 + vmslg %v15, %v0, %v0, %v0, 0 + vmslg %v31, %v0, %v0, %v0, 0 + vmslg %v18, %v3, %v20, %v5, 4 + vmslg %v18, %v3, %v20, %v5, 8 + +#CHECK: vmp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x78] +#CHECK: vmp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x78] +#CHECK: vmp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x78] +#CHECK: vmp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x78] +#CHECK: vmp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x78] + + vmp %v0, %v0, %v0, 0, 0 + vmp %v0, %v0, %v0, 0, 15 + vmp %v0, %v0, %v0, 255, 0 + vmp %v0, %v0, %v31, 0, 0 + vmp %v0, %v31, %v0, 0, 0 + vmp %v31, %v0, %v0, 0, 0 + vmp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vmsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x79] +#CHECK: vmsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x79] +#CHECK: vmsp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x79] +#CHECK: vmsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x79] +#CHECK: vmsp %v13, %v17, %v21, 121, 11 # encoding: 
[0xe6,0xd1,0x50,0xb7,0x96,0x79] + + vmsp %v0, %v0, %v0, 0, 0 + vmsp %v0, %v0, %v0, 0, 15 + vmsp %v0, %v0, %v0, 255, 0 + vmsp %v0, %v0, %v31, 0, 0 + vmsp %v0, %v31, %v0, 0, 0 + vmsp %v31, %v0, %v0, 0, 0 + vmsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vnn %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6e] +#CHECK: vnn %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6e] +#CHECK: vnn %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6e] +#CHECK: vnn %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6e] +#CHECK: vnn %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6e] + + vnn %v0, %v0, %v0 + vnn %v0, %v0, %v31 + vnn %v0, %v31, %v0 + vnn %v31, %v0, %v0 + vnn %v18, %v3, %v20 + +#CHECK: vnx %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6c] +#CHECK: vnx %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6c] +#CHECK: vnx %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6c] +#CHECK: vnx %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6c] +#CHECK: vnx %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6c] + + vnx %v0, %v0, %v0 + vnx %v0, %v0, %v31 + vnx %v0, %v31, %v0 + vnx %v31, %v0, %v0 + vnx %v18, %v3, %v20 + +#CHECK: voc %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6f] +#CHECK: voc %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6f] +#CHECK: voc %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6f] +#CHECK: voc %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6f] +#CHECK: voc %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6f] + + voc %v0, %v0, %v0 + voc %v0, %v0, %v31 + voc %v0, %v31, %v0 + voc %v31, %v0, %v0 + voc %v18, %v3, %v20 + +#CHECK: vpkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x34] +#CHECK: vpkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x34] +#CHECK: vpkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x34] 
+#CHECK: vpkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x34] +#CHECK: vpkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x34] + + vpkz %v0, 0, 0 + vpkz %v0, 4095, 0 + vpkz %v0, 0(%r15), 0 + vpkz %v0, 0, 255 + vpkz %v15, 0, 0 + vpkz %v31, 0, 0 + vpkz %v18, 1383(%r4), 3 + +#CHECK: vpopctb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vpopctb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x50] +#CHECK: vpopctb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x50] + + vpopctb %v0, %v0 + vpopctb %v0, %v15 + vpopctb %v0, %v31 + vpopctb %v15, %v0 + vpopctb %v31, %v0 + vpopctb %v14, %v17 + +#CHECK: vpopctf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x50] +#CHECK: vpopctf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x50] +#CHECK: vpopctf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x50] + + vpopctf %v0, %v0 + vpopctf %v0, %v15 + vpopctf %v0, %v31 + vpopctf %v15, %v0 + vpopctf %v31, %v0 + vpopctf %v14, %v17 + +#CHECK: vpopctg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x50] +#CHECK: vpopctg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x50] +#CHECK: vpopctg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0x50] + + vpopctg %v0, %v0 + vpopctg %v0, %v15 + vpopctg %v0, %v31 + vpopctg %v15, %v0 + vpopctg %v31, %v0 + vpopctg %v14, %v17 + +#CHECK: vpopcth %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x50] +#CHECK: vpopcth %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x50] +#CHECK: vpopcth %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x50] + + vpopcth %v0, %v0 + vpopcth %v0, %v15 + vpopcth %v0, %v31 + vpopcth %v15, %v0 + vpopcth %v31, %v0 + vpopcth %v14, %v17 + +#CHECK: vpsop %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5b] +#CHECK: vpsop %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5b] +#CHECK: vpsop %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5b] +#CHECK: vpsop %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x5b] + + vpsop %v0, %v0, 0, 0, 0 + vpsop %v0, %v0, 0, 0, 15 + vpsop %v0, %v0, 0, 255, 0 + vpsop %v0, %v0, 255, 0, 0 + vpsop %v0, %v31, 0, 0, 0 + vpsop %v31, %v0, 0, 0, 0 + vpsop %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vrp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7b] +#CHECK: vrp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7b] +#CHECK: vrp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7b] +#CHECK: vrp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7b] +#CHECK: vrp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7b] + + vrp %v0, %v0, %v0, 0, 0 + vrp %v0, %v0, %v0, 0, 15 + vrp %v0, %v0, %v0, 255, 0 + vrp %v0, %v0, %v31, 0, 0 + vrp %v0, %v31, %v0, 0, 0 + vrp %v31, %v0, %v0, 0, 0 + vrp %v13, %v17, 
%v21, 0x79, 11 + +#CHECK: vsdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7e] +#CHECK: vsdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7e] +#CHECK: vsdp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7e] +#CHECK: vsdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7e] +#CHECK: vsdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7e] + + vsdp %v0, %v0, %v0, 0, 0 + vsdp %v0, %v0, %v0, 0, 15 + vsdp %v0, %v0, %v0, 255, 0 + vsdp %v0, %v0, %v31, 0, 0 + vsdp %v0, %v31, %v0, 0, 0 + vsdp %v31, %v0, %v0, 0, 0 + vsdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x73] +#CHECK: vsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x73] +#CHECK: vsp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x73] +#CHECK: vsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x73] +#CHECK: vsp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x73] + + vsp %v0, %v0, %v0, 0, 0 + vsp %v0, %v0, %v0, 0, 15 + vsp %v0, %v0, %v0, 255, 0 + vsp %v0, %v0, %v31, 0, 0 + vsp %v0, %v31, %v0, 0, 0 + vsp %v31, %v0, %v0, 0, 0 + vsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsrp %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x59] +#CHECK: vsrp %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x59] +#CHECK: vsrp %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x59] 
+#CHECK: vsrp %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x59] + + vsrp %v0, %v0, 0, 0, 0 + vsrp %v0, %v0, 0, 0, 15 + vsrp %v0, %v0, 0, 255, 0 + vsrp %v0, %v0, 255, 0, 0 + vsrp %v0, %v31, 0, 0, 0 + vsrp %v31, %v0, 0, 0, 0 + vsrp %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vstrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3d] +#CHECK: vstrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3d] +#CHECK: vstrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3d] +#CHECK: vstrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3d] + + vstrl %v0, 0, 0 + vstrl %v0, 4095, 0 + vstrl %v0, 0(%r15), 0 + vstrl %v0, 0, 255 + vstrl %v15, 0, 0 + vstrl %v31, 0, 0 + vstrl %v18, 1383(%r4), 3 + +#CHECK: vstrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3f] +#CHECK: vstrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3f] +#CHECK: vstrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3f] + + vstrlr %v0, %r0, 0 + vstrlr %v0, %r0, 4095 + vstrlr %v0, %r0, 0(%r15) + vstrlr %v0, %r15, 0 + vstrlr %v15, %r0, 0 + vstrlr %v31, %r0, 0 + vstrlr %v18, %r3, 1383(%r4) + +#CHECK: vtp %v0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v15 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v31 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5f] + + vtp %v0 + vtp %v15 + vtp %v31 + +#CHECK: vupkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3c] 
+#CHECK: vupkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3c] +#CHECK: vupkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3c] +#CHECK: vupkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3c] + + vupkz %v0, 0, 0 + vupkz %v0, 4095, 0 + vupkz %v0, 0(%r15), 0 + vupkz %v0, 0, 255 + vupkz %v15, 0, 0 + vupkz %v31, 0, 0 + vupkz %v18, 1383(%r4), 3 + +#CHECK: wfasb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe3] +#CHECK: wfasb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe3] +#CHECK: wfasb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe3] +#CHECK: wfasb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe3] + + wfasb %v0, %v0, %v0 + wfasb %f0, %f0, %f0 + wfasb %v0, %v0, %v31 + wfasb %v0, %v31, %v0 + wfasb %v31, %v0, %v0 + wfasb %v18, %v3, %v20 + +#CHECK: wfaxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe3] +#CHECK: wfaxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe3] +#CHECK: wfaxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe3] +#CHECK: wfaxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe3] +#CHECK: wfaxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe3] + + wfaxb %v0, %v0, %v0 + wfaxb %v0, %v0, %v31 + wfaxb %v0, %v31, %v0 + wfaxb %v31, %v0, %v0 + wfaxb %v18, %v3, %v20 + +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcb] +#CHECK: wfcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %v31, %f0 # encoding: 
[0xe7,0xf0,0x00,0x00,0x28,0xcb] +#CHECK: wfcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcb] + + wfcsb %v0, %v0 + wfcsb %f0, %f0 + wfcsb %v0, %v15 + wfcsb %v0, %v31 + wfcsb %v15, %v0 + wfcsb %v31, %v0 + wfcsb %v14, %v17 + +#CHECK: wfcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xcb] +#CHECK: wfcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xcb] +#CHECK: wfcxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xcb] + + wfcxb %v0, %v0 + wfcxb %v0, %v15 + wfcxb %v0, %v31 + wfcxb %v15, %v0 + wfcxb %v31, %v0 + wfcxb %v14, %v17 + +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe8] +#CHECK: wfcesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe8] +#CHECK: wfcesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe8] +#CHECK: wfcesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe8] + + wfcesb %v0, %v0, %v0 + wfcesb %f0, %f0, %f0 + wfcesb %v0, %v0, %v31 + wfcesb %v0, %v31, %v0 + wfcesb %v31, %v0, %v0 + wfcesb %v18, %v3, %v20 + +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xe8] +#CHECK: wfcesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xe8] +#CHECK: wfcesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xe8] +#CHECK: wfcesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xe8] + + wfcesbs %v0, %v0, %v0 + wfcesbs %f0, %f0, %f0 + wfcesbs %v0, %v0, %v31 + wfcesbs %v0, %v31, %v0 + wfcesbs %v31, %v0, %v0 + wfcesbs %v18, %v3, %v20 + +#CHECK: wfcexb %v0, %v0, %v0 # 
encoding: [0xe7,0x00,0x00,0x08,0x40,0xe8] +#CHECK: wfcexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe8] +#CHECK: wfcexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe8] +#CHECK: wfcexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe8] +#CHECK: wfcexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe8] + + wfcexb %v0, %v0, %v0 + wfcexb %v0, %v0, %v31 + wfcexb %v0, %v31, %v0 + wfcexb %v31, %v0, %v0 + wfcexb %v18, %v3, %v20 + +#CHECK: wfcexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xe8] +#CHECK: wfcexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xe8] +#CHECK: wfcexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xe8] +#CHECK: wfcexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xe8] +#CHECK: wfcexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xe8] + + wfcexbs %v0, %v0, %v0 + wfcexbs %v0, %v0, %v31 + wfcexbs %v0, %v31, %v0 + wfcexbs %v31, %v0, %v0 + wfcexbs %v18, %v3, %v20 + +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xeb] +#CHECK: wfchsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xeb] +#CHECK: wfchsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xeb] +#CHECK: wfchsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xeb] + + wfchsb %v0, %v0, %v0 + wfchsb %f0, %f0, %f0 + wfchsb %v0, %v0, %v31 + wfchsb %v0, %v31, %v0 + wfchsb %v31, %v0, %v0 + wfchsb %v18, %v3, %v20 + +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xeb] +#CHECK: wfchsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xeb] +#CHECK: wfchsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xeb] +#CHECK: wfchsbs %v18, %f3, %v20 # encoding: 
[0xe7,0x23,0x40,0x18,0x2a,0xeb] + + wfchsbs %v0, %v0, %v0 + wfchsbs %f0, %f0, %f0 + wfchsbs %v0, %v0, %v31 + wfchsbs %v0, %v31, %v0 + wfchsbs %v31, %v0, %v0 + wfchsbs %v18, %v3, %v20 + +#CHECK: wfchxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xeb] +#CHECK: wfchxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xeb] +#CHECK: wfchxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xeb] +#CHECK: wfchxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xeb] +#CHECK: wfchxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xeb] + + wfchxb %v0, %v0, %v0 + wfchxb %v0, %v0, %v31 + wfchxb %v0, %v31, %v0 + wfchxb %v31, %v0, %v0 + wfchxb %v18, %v3, %v20 + +#CHECK: wfchxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xeb] +#CHECK: wfchxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xeb] +#CHECK: wfchxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xeb] +#CHECK: wfchxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xeb] +#CHECK: wfchxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xeb] + + wfchxbs %v0, %v0, %v0 + wfchxbs %v0, %v0, %v31 + wfchxbs %v0, %v31, %v0 + wfchxbs %v31, %v0, %v0 + wfchxbs %v18, %v3, %v20 + +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xea] +#CHECK: wfchesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xea] +#CHECK: wfchesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xea] +#CHECK: wfchesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xea] + + wfchesb %v0, %v0, %v0 + wfchesb %f0, %f0, %f0 + wfchesb %v0, %v0, %v31 + wfchesb %v0, %v31, %v0 + wfchesb %v31, %v0, %v0 + wfchesb %v18, %v3, %v20 + +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %v31 # encoding: 
[0xe7,0x00,0xf0,0x18,0x22,0xea] +#CHECK: wfchesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xea] +#CHECK: wfchesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xea] +#CHECK: wfchesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xea] + + wfchesbs %v0, %v0, %v0 + wfchesbs %f0, %f0, %f0 + wfchesbs %v0, %v0, %v31 + wfchesbs %v0, %v31, %v0 + wfchesbs %v31, %v0, %v0 + wfchesbs %v18, %v3, %v20 + +#CHECK: wfchexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xea] +#CHECK: wfchexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xea] +#CHECK: wfchexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xea] +#CHECK: wfchexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xea] +#CHECK: wfchexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xea] + + wfchexb %v0, %v0, %v0 + wfchexb %v0, %v0, %v31 + wfchexb %v0, %v31, %v0 + wfchexb %v31, %v0, %v0 + wfchexb %v18, %v3, %v20 + +#CHECK: wfchexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xea] +#CHECK: wfchexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xea] +#CHECK: wfchexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xea] +#CHECK: wfchexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xea] +#CHECK: wfchexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xea] + + wfchexbs %v0, %v0, %v0 + wfchexbs %v0, %v0, %v31 + wfchexbs %v0, %v31, %v0 + wfchexbs %v31, %v0, %v0 + wfchexbs %v18, %v3, %v20 + +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe5] +#CHECK: wfdsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe5] +#CHECK: wfdsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe5] +#CHECK: wfdsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe5] + + wfdsb %v0, %v0, %v0 + wfdsb %f0, %f0, %f0 + wfdsb %v0, %v0, %v31 + wfdsb %v0, %v31, %v0 + wfdsb %v31, 
%v0, %v0 + wfdsb %v18, %v3, %v20 + +#CHECK: wfdxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe5] +#CHECK: wfdxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe5] +#CHECK: wfdxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe5] +#CHECK: wfdxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe5] +#CHECK: wfdxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe5] + + wfdxb %v0, %v0, %v0 + wfdxb %v0, %v0, %v31 + wfdxb %v0, %v31, %v0 + wfdxb %v31, %v0, %v0 + wfdxb %v18, %v3, %v20 + +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x20,0xc7] +#CHECK: wfisb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc7] +#CHECK: wfisb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc7] +#CHECK: wfisb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x24,0xc7] + + wfisb %v0, %v0, 0, 0 + wfisb %f0, %f0, 0, 0 + wfisb %v0, %v0, 0, 15 + wfisb %v0, %v0, 4, 0 + wfisb %v0, %v0, 7, 0 + wfisb %v0, %v31, 0, 0 + wfisb %v31, %v0, 0, 0 + wfisb %v14, %v17, 4, 10 + +#CHECK: wfixb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc7] +#CHECK: wfixb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc7] +#CHECK: wfixb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc7] +#CHECK: wfixb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc7] + + wfixb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 15 + wfixb %v0, %v0, 4, 0 + wfixb %v0, %v0, 7, 0 + wfixb %v0, %v31, 0, 0 + wfixb %v31, %v0, 0, 0 + wfixb %v14, 
%v17, 4, 10 + +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xca] +#CHECK: wfksb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xca] +#CHECK: wfksb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xca] +#CHECK: wfksb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xca] + + wfksb %v0, %v0 + wfksb %f0, %f0 + wfksb %v0, %v15 + wfksb %v0, %v31 + wfksb %v15, %v0 + wfksb %v31, %v0 + wfksb %v14, %v17 + +#CHECK: wfkxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xca] +#CHECK: wfkxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xca] +#CHECK: wfkxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xca] + + wfkxb %v0, %v0 + wfkxb %v0, %v15 + wfkxb %v0, %v31 + wfkxb %v15, %v0 + wfkxb %v31, %v0 + wfkxb %v14, %v17 + +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xe8] +#CHECK: wfkedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xe8] +#CHECK: wfkedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xe8] +#CHECK: wfkedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xe8] + + wfkedb %v0, %v0, %v0 + wfkedb %f0, %f0, %f0 + wfkedb %v0, %v0, %v31 + wfkedb %v0, %v31, %v0 + wfkedb %v31, %v0, %v0 + wfkedb %v18, %v3, %v20 + +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xe8] +#CHECK: wfkedbs %f0, 
%v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xe8] +#CHECK: wfkedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xe8] +#CHECK: wfkedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xe8] + + wfkedbs %v0, %v0, %v0 + wfkedbs %f0, %f0, %f0 + wfkedbs %v0, %v0, %v31 + wfkedbs %v0, %v31, %v0 + wfkedbs %v31, %v0, %v0 + wfkedbs %v18, %v3, %v20 + +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xe8] +#CHECK: wfkesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xe8] +#CHECK: wfkesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xe8] +#CHECK: wfkesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xe8] + + wfkesb %v0, %v0, %v0 + wfkesb %f0, %f0, %f0 + wfkesb %v0, %v0, %v31 + wfkesb %v0, %v31, %v0 + wfkesb %v31, %v0, %v0 + wfkesb %v18, %v3, %v20 + +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xe8] +#CHECK: wfkesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xe8] +#CHECK: wfkesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xe8] +#CHECK: wfkesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xe8] + + wfkesbs %v0, %v0, %v0 + wfkesbs %f0, %f0, %f0 + wfkesbs %v0, %v0, %v31 + wfkesbs %v0, %v31, %v0 + wfkesbs %v31, %v0, %v0 + wfkesbs %v18, %v3, %v20 + +#CHECK: wfkexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xe8] +#CHECK: wfkexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xe8] +#CHECK: wfkexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xe8] +#CHECK: wfkexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xe8] +#CHECK: wfkexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xe8] + + wfkexb %v0, %v0, %v0 + wfkexb %v0, %v0, %v31 + wfkexb 
%v0, %v31, %v0 + wfkexb %v31, %v0, %v0 + wfkexb %v18, %v3, %v20 + +#CHECK: wfkexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xe8] +#CHECK: wfkexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xe8] +#CHECK: wfkexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xe8] +#CHECK: wfkexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xe8] +#CHECK: wfkexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xe8] + + wfkexbs %v0, %v0, %v0 + wfkexbs %v0, %v0, %v31 + wfkexbs %v0, %v31, %v0 + wfkexbs %v31, %v0, %v0 + wfkexbs %v18, %v3, %v20 + +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xeb] +#CHECK: wfkhdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xeb] +#CHECK: wfkhdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xeb] +#CHECK: wfkhdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xeb] + + wfkhdb %v0, %v0, %v0 + wfkhdb %f0, %f0, %f0 + wfkhdb %v0, %v0, %v31 + wfkhdb %v0, %v31, %v0 + wfkhdb %v31, %v0, %v0 + wfkhdb %v18, %v3, %v20 + +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xeb] +#CHECK: wfkhdbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xeb] +#CHECK: wfkhdbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xeb] +#CHECK: wfkhdbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xeb] + + wfkhdbs %v0, %v0, %v0 + wfkhdbs %f0, %f0, %f0 + wfkhdbs %v0, %v0, %v31 + wfkhdbs %v0, %v31, %v0 + wfkhdbs %v31, %v0, %v0 + wfkhdbs %v18, %v3, %v20 + +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %v31 # encoding: 
[0xe7,0x00,0xf0,0x0c,0x22,0xeb] +#CHECK: wfkhsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xeb] +#CHECK: wfkhsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xeb] +#CHECK: wfkhsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xeb] + + wfkhsb %v0, %v0, %v0 + wfkhsb %f0, %f0, %f0 + wfkhsb %v0, %v0, %v31 + wfkhsb %v0, %v31, %v0 + wfkhsb %v31, %v0, %v0 + wfkhsb %v18, %v3, %v20 + +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xeb] +#CHECK: wfkhsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xeb] +#CHECK: wfkhsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xeb] +#CHECK: wfkhsbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xeb] + + wfkhsbs %v0, %v0, %v0 + wfkhsbs %f0, %f0, %f0 + wfkhsbs %v0, %v0, %v31 + wfkhsbs %v0, %v31, %v0 + wfkhsbs %v31, %v0, %v0 + wfkhsbs %v18, %v3, %v20 + +#CHECK: wfkhxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xeb] +#CHECK: wfkhxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xeb] +#CHECK: wfkhxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xeb] +#CHECK: wfkhxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xeb] +#CHECK: wfkhxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xeb] + + wfkhxb %v0, %v0, %v0 + wfkhxb %v0, %v0, %v31 + wfkhxb %v0, %v31, %v0 + wfkhxb %v31, %v0, %v0 + wfkhxb %v18, %v3, %v20 + +#CHECK: wfkhxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xeb] +#CHECK: wfkhxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xeb] +#CHECK: wfkhxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xeb] +#CHECK: wfkhxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xeb] +#CHECK: wfkhxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xeb] + + wfkhxbs %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v31 + wfkhxbs %v0, %v31, %v0 + wfkhxbs %v31, %v0, %v0 + 
wfkhxbs %v18, %v3, %v20 + +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xea] +#CHECK: wfkhedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xea] +#CHECK: wfkhedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xea] +#CHECK: wfkhedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xea] + + wfkhedb %v0, %v0, %v0 + wfkhedb %f0, %f0, %f0 + wfkhedb %v0, %v0, %v31 + wfkhedb %v0, %v31, %v0 + wfkhedb %v31, %v0, %v0 + wfkhedb %v18, %v3, %v20 + +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xea] +#CHECK: wfkhedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xea] +#CHECK: wfkhedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xea] +#CHECK: wfkhedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xea] + + wfkhedbs %v0, %v0, %v0 + wfkhedbs %f0, %f0, %f0 + wfkhedbs %v0, %v0, %v31 + wfkhedbs %v0, %v31, %v0 + wfkhedbs %v31, %v0, %v0 + wfkhedbs %v18, %v3, %v20 + +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xea] +#CHECK: wfkhesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xea] +#CHECK: wfkhesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xea] +#CHECK: wfkhesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xea] + + wfkhesb %v0, %v0, %v0 + wfkhesb %f0, %f0, %f0 + wfkhesb %v0, %v0, %v31 + wfkhesb %v0, %v31, %v0 + wfkhesb %v31, %v0, %v0 + wfkhesb %v18, %v3, %v20 + +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: 
[0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xea] +#CHECK: wfkhesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xea] +#CHECK: wfkhesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xea] +#CHECK: wfkhesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xea] + + wfkhesbs %v0, %v0, %v0 + wfkhesbs %f0, %f0, %f0 + wfkhesbs %v0, %v0, %v31 + wfkhesbs %v0, %v31, %v0 + wfkhesbs %v31, %v0, %v0 + wfkhesbs %v18, %v3, %v20 + +#CHECK: wfkhexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xea] +#CHECK: wfkhexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xea] +#CHECK: wfkhexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xea] +#CHECK: wfkhexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xea] +#CHECK: wfkhexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xea] + + wfkhexb %v0, %v0, %v0 + wfkhexb %v0, %v0, %v31 + wfkhexb %v0, %v31, %v0 + wfkhexb %v31, %v0, %v0 + wfkhexb %v18, %v3, %v20 + +#CHECK: wfkhexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xea] +#CHECK: wfkhexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xea] +#CHECK: wfkhexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xea] +#CHECK: wfkhexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xea] +#CHECK: wfkhexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xea] + + wfkhexbs %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v31 + wfkhexbs %v0, %v31, %v0 + wfkhexbs %v31, %v0, %v0 + wfkhexbs %v18, %v3, %v20 + +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xcc] +#CHECK: wfpsosb %f0, %f15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x24,0xcc] +#CHECK: wfpsosb %f15, %f0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %v31, %f0, 3 
# encoding: [0xe7,0xf0,0x00,0x38,0x28,0xcc] +#CHECK: wfpsosb %f14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x24,0xcc] + + wfpsosb %v0, %v0, 3 + wfpsosb %f0, %f0, 3 + wfpsosb %v0, %v0, 15 + wfpsosb %v0, %v15, 3 + wfpsosb %v0, %v31, 3 + wfpsosb %v15, %v0, 3 + wfpsosb %v31, %v0, 3 + wfpsosb %v14, %v17, 7 + +#CHECK: wfpsoxb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x44,0xcc] +#CHECK: wfpsoxb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x48,0xcc] +#CHECK: wfpsoxb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x44,0xcc] + + wfpsoxb %v0, %v0, 3 + wfpsoxb %v0, %v0, 15 + wfpsoxb %v0, %v15, 3 + wfpsoxb %v0, %v31, 3 + wfpsoxb %v15, %v0, 3 + wfpsoxb %v31, %v0, 3 + wfpsoxb %v14, %v17, 7 + +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xcc] +#CHECK: wflcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xcc] +#CHECK: wflcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xcc] + + wflcsb %v0, %v0 + wflcsb %f0, %f0 + wflcsb %v0, %v15 + wflcsb %v0, %v31 + wflcsb %v15, %v0 + wflcsb %v31, %v0 + wflcsb %v14, %v17 + +#CHECK: wflcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xcc] +#CHECK: wflcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xcc] +#CHECK: wflcxb %v14, %v17 # 
encoding: [0xe7,0xe1,0x00,0x08,0x44,0xcc] + + wflcxb %v0, %v0 + wflcxb %v0, %v15 + wflcxb %v0, %v31 + wflcxb %v15, %v0 + wflcxb %v31, %v0 + wflcxb %v14, %v17 + +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xcc] +#CHECK: wflnsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xcc] +#CHECK: wflnsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x24,0xcc] + + wflnsb %v0, %v0 + wflnsb %f0, %f0 + wflnsb %v0, %v15 + wflnsb %v0, %v31 + wflnsb %v15, %v0 + wflnsb %v31, %v0 + wflnsb %v14, %v17 + +#CHECK: wflnxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xcc] +#CHECK: wflnxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xcc] +#CHECK: wflnxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x44,0xcc] + + wflnxb %v0, %v0 + wflnxb %v0, %v15 + wflnxb %v0, %v31 + wflnxb %v15, %v0 + wflnxb %v31, %v0 + wflnxb %v14, %v17 + +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x24,0xcc] +#CHECK: wflpsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x28,0xcc] +#CHECK: wflpsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x24,0xcc] + + wflpsb %v0, %v0 + wflpsb %f0, %f0 + wflpsb %v0, %v15 + wflpsb %v0, %v31 + wflpsb %v15, %v0 + wflpsb %v31, %v0 + wflpsb %v14, %v17 + +#CHECK: wflpxb %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x44,0xcc] +#CHECK: wflpxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x48,0xcc] +#CHECK: wflpxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x44,0xcc] + + wflpxb %v0, %v0 + wflpxb %v0, %v15 + wflpxb %v0, %v31 + wflpxb %v15, %v0 + wflpxb %v31, %v0 + wflpxb %v14, %v17 + +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc4] +#CHECK: wflls %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xc4] +#CHECK: wflls %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc4] +#CHECK: wflls %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xc4] + + wflls %v0, %v0 + wflls %f0, %f0 + wflls %v0, %v15 + wflls %v0, %v31 + wflls %v15, %v0 + wflls %v31, %v0 + wflls %v14, %v17 + +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc4] +#CHECK: wflld %v15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc4] +#CHECK: wflld %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xc4] + + wflld %v0, %v0 + wflld %v0, %f0 + wflld %v0, %v15 + wflld %v0, %v31 + wflld %v15, %v0 + wflld %v31, %v0 + wflld %v14, %v17 + +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 4, 0 # encoding: 
[0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc5] +#CHECK: wflrd %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc5] +#CHECK: wflrd %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc5] + + wflrd %v0, %v0, 0, 0 + wflrd %f0, %f0, 0, 0 + wflrd %v0, %v0, 0, 15 + wflrd %v0, %v0, 4, 0 + wflrd %v0, %v0, 12, 0 + wflrd %v0, %v31, 0, 0 + wflrd %v31, %v0, 0, 0 + wflrd %v14, %v17, 4, 10 + +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc5] +#CHECK: wflrx %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc5] +#CHECK: wflrx %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc5] +#CHECK: wflrx %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc5] + + wflrx %v0, %v0, 0, 0 + wflrx %f0, %v0, 0, 0 + wflrx %v0, %v0, 0, 15 + wflrx %v0, %v0, 4, 0 + wflrx %v0, %v0, 7, 0 + wflrx %v0, %v31, 0, 0 + wflrx %v31, %v0, 0, 0 + wflrx %v14, %v17, 4, 10 + +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xef] +#CHECK: wfmaxdb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xef] +#CHECK: wfmaxdb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xef] +#CHECK: wfmaxdb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x3a,0xef] + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxdb %f0, %f0, %f0, 0 + wfmaxdb %v0, %v0, %v0, 4 + wfmaxdb %v0, %v0, %v31, 0 + wfmaxdb %v0, 
%v31, %v0, 0 + wfmaxdb %v31, %v0, %v0, 0 + wfmaxdb %v18, %v3, %v20, 11 + +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xef] +#CHECK: wfmaxsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xef] +#CHECK: wfmaxsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xef] +#CHECK: wfmaxsb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x2a,0xef] + + wfmaxsb %v0, %v0, %v0, 0 + wfmaxsb %f0, %f0, %f0, 0 + wfmaxsb %v0, %v0, %v0, 4 + wfmaxsb %v0, %v0, %v31, 0 + wfmaxsb %v0, %v31, %v0, 0 + wfmaxsb %v31, %v0, %v0, 0 + wfmaxsb %v18, %v3, %v20, 11 + +#CHECK: wfmaxxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xef] +#CHECK: wfmaxxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xef] +#CHECK: wfmaxxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xef] +#CHECK: wfmaxxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xef] + + wfmaxxb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 4 + wfmaxxb %v0, %v0, %v31, 0 + wfmaxxb %v0, %v31, %v0, 0 + wfmaxxb %v31, %v0, %v0, 0 + wfmaxxb %v18, %v3, %v20, 11 + +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xee] +#CHECK: wfmindb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xee] +#CHECK: wfmindb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xee] +#CHECK: wfmindb %v18, %f3, %v20, 11 # encoding: 
[0xe7,0x23,0x40,0xb8,0x3a,0xee] + + wfmindb %v0, %v0, %v0, 0 + wfmindb %f0, %f0, %f0, 0 + wfmindb %v0, %v0, %v0, 4 + wfmindb %v0, %v0, %v31, 0 + wfmindb %v0, %v31, %v0, 0 + wfmindb %v31, %v0, %v0, 0 + wfmindb %v18, %v3, %v20, 11 + +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xee] +#CHECK: wfminsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xee] +#CHECK: wfminsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xee] +#CHECK: wfminsb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x2a,0xee] + + wfminsb %v0, %v0, %v0, 0 + wfminsb %f0, %f0, %f0, 0 + wfminsb %v0, %v0, %v0, 4 + wfminsb %v0, %v0, %v31, 0 + wfminsb %v0, %v31, %v0, 0 + wfminsb %v31, %v0, %v0, 0 + wfminsb %v18, %v3, %v20, 11 + +#CHECK: wfminxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xee] +#CHECK: wfminxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xee] +#CHECK: wfminxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xee] +#CHECK: wfminxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xee] + + wfminxb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 4 + wfminxb %v0, %v0, %v31, 0 + wfminxb %v0, %v31, %v0, 0 + wfminxb %v31, %v0, %v0, 0 + wfminxb %v18, %v3, %v20, 11 + +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8f] +#CHECK: wfmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8f] +#CHECK: wfmasb %f0, %v31, %f0, %f0 # encoding: 
[0xe7,0x0f,0x02,0x08,0x04,0x8f] +#CHECK: wfmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8f] +#CHECK: wfmasb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x8f] + + wfmasb %v0, %v0, %v0, %v0 + wfmasb %f0, %f0, %f0, %f0 + wfmasb %v0, %v0, %v0, %v31 + wfmasb %v0, %v0, %v31, %v0 + wfmasb %v0, %v31, %v0, %v0 + wfmasb %v31, %v0, %v0, %v0 + wfmasb %v13, %v17, %v21, %v25 + +#CHECK: wfmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8f] +#CHECK: wfmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8f] +#CHECK: wfmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8f] +#CHECK: wfmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8f] +#CHECK: wfmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8f] +#CHECK: wfmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8f] + + wfmaxb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v31 + wfmaxb %v0, %v0, %v31, %v0 + wfmaxb %v0, %v31, %v0, %v0 + wfmaxb %v31, %v0, %v0, %v0 + wfmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe7] +#CHECK: wfmsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe7] +#CHECK: wfmsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe7] +#CHECK: wfmsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe7] + + wfmsb %v0, %v0, %v0 + wfmsb %f0, %f0, %f0 + wfmsb %v0, %v0, %v31 + wfmsb %v0, %v31, %v0 + wfmsb %v31, %v0, %v0 + wfmsb %v18, %v3, %v20 + +#CHECK: wfmxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe7] +#CHECK: wfmxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe7] +#CHECK: wfmxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe7] +#CHECK: wfmxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe7] +#CHECK: wfmxb %v18, %v3, %v20 # encoding: 
[0xe7,0x23,0x40,0x08,0x4a,0xe7] + + wfmxb %v0, %v0, %v0 + wfmxb %v0, %v0, %v31 + wfmxb %v0, %v31, %v0 + wfmxb %v31, %v0, %v0 + wfmxb %v18, %v3, %v20 + +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8e] +#CHECK: wfmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8e] +#CHECK: wfmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x8e] +#CHECK: wfmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8e] +#CHECK: wfmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x8e] + + wfmssb %v0, %v0, %v0, %v0 + wfmssb %f0, %f0, %f0, %f0 + wfmssb %v0, %v0, %v0, %v31 + wfmssb %v0, %v0, %v31, %v0 + wfmssb %v0, %v31, %v0, %v0 + wfmssb %v31, %v0, %v0, %v0 + wfmssb %v13, %v17, %v21, %v25 + +#CHECK: wfmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8e] +#CHECK: wfmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8e] +#CHECK: wfmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8e] +#CHECK: wfmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8e] +#CHECK: wfmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8e] +#CHECK: wfmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8e] + + wfmsxb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v31 + wfmsxb %v0, %v0, %v31, %v0 + wfmsxb %v0, %v31, %v0, %v0 + wfmsxb %v31, %v0, %v0, %v0 + wfmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9f] +#CHECK: wfnmadb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9f] +#CHECK: wfnmadb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9f] +#CHECK: 
wfnmadb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9f] +#CHECK: wfnmadb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9f] + + wfnmadb %v0, %v0, %v0, %v0 + wfnmadb %f0, %f0, %f0, %f0 + wfnmadb %v0, %v0, %v0, %v31 + wfnmadb %v0, %v0, %v31, %v0 + wfnmadb %v0, %v31, %v0, %v0 + wfnmadb %v31, %v0, %v0, %v0 + wfnmadb %v13, %v17, %v21, %v25 + +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9f] +#CHECK: wfnmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9f] +#CHECK: wfnmasb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9f] +#CHECK: wfnmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9f] +#CHECK: wfnmasb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9f] + + wfnmasb %v0, %v0, %v0, %v0 + wfnmasb %f0, %f0, %f0, %f0 + wfnmasb %v0, %v0, %v0, %v31 + wfnmasb %v0, %v0, %v31, %v0 + wfnmasb %v0, %v31, %v0, %v0 + wfnmasb %v31, %v0, %v0, %v0 + wfnmasb %v13, %v17, %v21, %v25 + +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9f] +#CHECK: wfnmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9f] +#CHECK: wfnmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9f] +#CHECK: wfnmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9f] + + wfnmaxb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v31 + wfnmaxb %v0, %v0, %v31, %v0 + wfnmaxb %v0, %v31, %v0, %v0 + wfnmaxb %v31, %v0, %v0, %v0 + wfnmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: 
wfnmsdb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9e] +#CHECK: wfnmsdb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9e] +#CHECK: wfnmsdb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9e] +#CHECK: wfnmsdb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9e] +#CHECK: wfnmsdb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9e] + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmsdb %f0, %f0, %f0, %f0 + wfnmsdb %v0, %v0, %v0, %v31 + wfnmsdb %v0, %v0, %v31, %v0 + wfnmsdb %v0, %v31, %v0, %v0 + wfnmsdb %v31, %v0, %v0, %v0 + wfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9e] +#CHECK: wfnmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9e] +#CHECK: wfnmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9e] +#CHECK: wfnmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9e] +#CHECK: wfnmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9e] + + wfnmssb %v0, %v0, %v0, %v0 + wfnmssb %f0, %f0, %f0, %f0 + wfnmssb %v0, %v0, %v0, %v31 + wfnmssb %v0, %v0, %v31, %v0 + wfnmssb %v0, %v31, %v0, %v0 + wfnmssb %v31, %v0, %v0, %v0 + wfnmssb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9e] +#CHECK: wfnmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9e] +#CHECK: wfnmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9e] +#CHECK: wfnmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9e] + + wfnmsxb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v31 + wfnmsxb %v0, %v0, %v31, %v0 + wfnmsxb %v0, %v31, 
%v0, %v0 + wfnmsxb %v31, %v0, %v0, %v0 + wfnmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe2] +#CHECK: wfssb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe2] +#CHECK: wfssb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe2] +#CHECK: wfssb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe2] + + wfssb %v0, %v0, %v0 + wfssb %f0, %f0, %f0 + wfssb %v0, %v0, %v31 + wfssb %v0, %v31, %v0 + wfssb %v31, %v0, %v0 + wfssb %v18, %v3, %v20 + +#CHECK: wfsxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe2] +#CHECK: wfsxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe2] +#CHECK: wfsxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe2] +#CHECK: wfsxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe2] +#CHECK: wfsxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe2] + + wfsxb %v0, %v0, %v0 + wfsxb %v0, %v0, %v31 + wfsxb %v0, %v31, %v0 + wfsxb %v31, %v0, %v0 + wfsxb %v18, %v3, %v20 + +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xce] +#CHECK: wfsqsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xce] +#CHECK: wfsqsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xce] + + wfsqsb %v0, %v0 + wfsqsb %f0, %f0 + wfsqsb %v0, %v15 + wfsqsb %v0, %v31 + wfsqsb %v15, %v0 + wfsqsb %v31, %v0 + wfsqsb %v14, %v17 + +#CHECK: wfsqxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xce] +#CHECK: wfsqxb 
%v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xce] +#CHECK: wfsqxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x44,0xce] + + wfsqxb %v0, %v0 + wfsqxb %v0, %v15 + wfsqxb %v0, %v31 + wfsqxb %v15, %v0 + wfsqxb %v31, %v0 + wfsqxb %v14, %v17 + +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x20,0x4a] +#CHECK: wftcisb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0x4a] +#CHECK: wftcisb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0x4a] +#CHECK: wftcisb %f4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x24,0x4a] + + wftcisb %v0, %v0, 0 + wftcisb %f0, %f0, 0 + wftcisb %v0, %v0, 4095 + wftcisb %v0, %v15, 0 + wftcisb %v0, %v31, 0 + wftcisb %v15, %v0, 0 + wftcisb %v31, %v0, 0 + wftcisb %v4, %v21, 0x678 + +#CHECK: wftcixb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x40,0x4a] +#CHECK: wftcixb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0x4a] +#CHECK: wftcixb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0x4a] +#CHECK: wftcixb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x44,0x4a] + + wftcixb %v0, %v0, 0 + wftcixb %v0, %v0, 4095 + wftcixb %v0, %v15, 0 + wftcixb %v0, %v31, 0 + wftcixb %v15, %v0, 0 + wftcixb %v31, %v0, 0 + wftcixb %v4, %v21, 0x678 + diff --git a/test/MC/SystemZ/invalid-instructions-spellcheck.s b/test/MC/SystemZ/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..e77b99d9a387 --- /dev/null +++ 
b/test/MC/SystemZ/invalid-instructions-spellcheck.s @@ -0,0 +1,66 @@ +# RUN: not llvm-mc -triple=systemz -mcpu=z13 -show-encoding < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=systemz -mcpu=zEC12 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ZEC12 + +# This tests the mnemonic spell checker. + +# First check what happens when an instruction is omitted: + + %r1, %r2, %r3 + +# CHECK: error: unexpected token at start of statement +# CHECK-NEXT: %r1, %r2, %r3 +# CHECK-NEXT: ^ + +# We don't want to see a suggestion here; the edit distance is too large to +# give sensible suggestions: + + aaaaaaaaaaaaaaa %r1, %r2, %r3 + +# CHECK: error: invalid instruction +# CHECK-NEXT: aaaaaaaaaaaaaaa %r1, %r2, %r3 +# CHECK-NEXT: ^ + +# Check that we get one suggestion: 'cpdt' is 1 edit away, i.e. an deletion. + + cpdtX %r1, 0(4, %r15), 0 + +#CHECK: error: invalid instruction, did you mean: cpdt +#CHECK-NEXT: cpdtX %r1, 0(4, %r15), 0 +#CHECK-NEXT: ^ + +# Check edit distance 1 and 2 + + ltTr %r1, %r2 + +# CHECK: error: invalid instruction, did you mean: lr, lt, ltdr, ltdtr, lter, ltgr, ltr, ltxr, ltxtr, tr, trtr? +# CHECK-NEXT: ltTr %r1, %r2 +# CHECK-NEXT: ^ + +# Check edit distance 1 and 2, just insertions: + + begin 0, 65292 + +# CHECK: error: invalid instruction, did you mean: tbegin, tbeginc? +# CHECK-NEXT: begin 0, 65292 +# CHECK-NEXT: ^ + +# Check an instruction that is 2 edits away, and also has a lot of candidates: + + adt %r1, 244(%r15) + +# CHECK: error: invalid instruction, did you mean: a, ad, adb, adr, adtr, adtra, d, lat, mad, qadtr? +# CHECK-NEXT: adt %r1, 244(%r15) +# CHECK-NEXT: ^ + +# Here it is checked that we don't suggest instructions that are not supported. +# For example, in pre-z13 mode we don't want to see suggestions for vector instructions. + + vlvggp %v1, %r2, %r3 + +# CHECK-ZEC12: error: invalid instruction +# CHECK-ZEC12: vlvggp +# CHECK-ZEC12: ^ + +# CHECK: error: invalid instruction, did you mean: vlvg, vlvgg, vlvgp? 
+# CHECK-NEXT: vlvggp %v1, %r2, %r3 +# CHECK-NEXT: ^ diff --git a/test/MC/X86/pr22028.s b/test/MC/X86/pr22028.s index 6ce7da07488b..d82b50f051a1 100644 --- a/test/MC/X86/pr22028.s +++ b/test/MC/X86/pr22028.s @@ -1,6 +1,6 @@ // RUN: llvm-mc -triple i386-unknown-unknown-code16 -show-encoding %s | FileCheck --check-prefix=CHECK16 %s -// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s -// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s +// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck %s .intel_syntax diff --git a/test/Object/no-section-table.test b/test/Object/no-section-table.test index bd60e681b71f..9ecde4f8c369 100644 --- a/test/Object/no-section-table.test +++ b/test/Object/no-section-table.test @@ -3,7 +3,7 @@ RUN: | FileCheck %s CHECK: DynamicSection [ (24 entries) CHECK: Tag Type Name/Value -CHECK: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) +CHECK: 0x0000000000000001 NEEDED Shared library: [libc.so.6] CHECK: 0x000000000000000C INIT 0x4B8 CHECK: 0x000000000000000D FINI 0x618 CHECK: 0x0000000000000019 INIT_ARRAY 0x2006C0 diff --git a/test/Object/readobj-shared-object.test b/test/Object/readobj-shared-object.test index 173581e60c39..59f5ff127cf3 100644 --- a/test/Object/readobj-shared-object.test +++ b/test/Object/readobj-shared-object.test @@ -302,9 +302,9 @@ ELF: ] ELF32: DynamicSection [ (9 entries) ELF32: Tag Type Name/Value -ELF32: 0x00000001 NEEDED SharedLibrary (libc.so.6) -ELF32: 0x00000001 NEEDED SharedLibrary (libm.so.6) -ELF32: 0x0000000E SONAME LibrarySoname (libfoo.so) +ELF32: 0x00000001 NEEDED Shared library: [libc.so.6] +ELF32: 0x00000001 NEEDED Shared library: [libm.so.6] +ELF32: 0x0000000E SONAME Library soname: [libfoo.so] ELF32: 0x00000004 HASH {{[0-9a-f]+}} ELF32: 0x00000005 STRTAB {{[0-9a-f]+}} ELF32: 0x00000006 SYMTAB {{[0-9a-f]+}} @@ 
-315,9 +315,9 @@ ELF32: ] ELF64: DynamicSection [ (9 entries) ELF64: Tag Type Name/Value -ELF64: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) -ELF64: 0x0000000000000001 NEEDED SharedLibrary (libm.so.6) -ELF64: 0x000000000000000E SONAME LibrarySoname (libfoo.so) +ELF64: 0x0000000000000001 NEEDED Shared library: [libc.so.6] +ELF64: 0x0000000000000001 NEEDED Shared library: [libm.so.6] +ELF64: 0x000000000000000E SONAME Library soname: [libfoo.so] ELF64: 0x0000000000000004 HASH {{[0-9a-f]+}} ELF64: 0x0000000000000005 STRTAB {{[0-9a-f]+}} ELF64: 0x0000000000000006 SYMTAB {{[0-9a-f]+}} diff --git a/test/ObjectYAML/CodeView/guid.yaml b/test/ObjectYAML/CodeView/guid.yaml new file mode 100644 index 000000000000..8d8d0142c5e3 --- /dev/null +++ b/test/ObjectYAML/CodeView/guid.yaml @@ -0,0 +1,59 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.debug$T' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + Types: + - Kind: LF_TYPESERVER2 + TypeServer2: + Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}' + Age: 24 + Name: 'C:\src\llvm-project\build\vc140.pdb' +symbols: + - Name: '.debug$T' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 64 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 +... 
+ +# CHECK: --- !COFF +# CHECK: header: +# CHECK: Machine: IMAGE_FILE_MACHINE_AMD64 +# CHECK: Characteristics: [ ] +# CHECK: sections: +# CHECK: - Name: '.debug$T' +# CHECK: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] +# CHECK: Alignment: 1 +# CHECK: Types: +# CHECK: - Kind: LF_TYPESERVER2 +# CHECK: TypeServer2: +# CHECK: Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}' +# CHECK: Age: 24 +# CHECK: Name: 'C:\src\llvm-project\build\vc140.pdb' +# CHECK: symbols: +# CHECK: - Name: '.debug$T' +# CHECK: Value: 0 +# CHECK: SectionNumber: 1 +# CHECK: SimpleType: IMAGE_SYM_TYPE_NULL +# CHECK: ComplexType: IMAGE_SYM_DTYPE_NULL +# CHECK: StorageClass: IMAGE_SYM_CLASS_STATIC +# CHECK: SectionDefinition: +# CHECK: Length: 64 +# CHECK: NumberOfRelocations: 0 +# CHECK: NumberOfLinenumbers: 0 +# CHECK: CheckSum: 0 +# CHECK: Number: 0 +# CHECK: ... diff --git a/test/Other/cgscc-libcall-update.ll b/test/Other/cgscc-libcall-update.ll new file mode 100644 index 000000000000..e0833ca09266 --- /dev/null +++ b/test/Other/cgscc-libcall-update.ll @@ -0,0 +1,61 @@ +; Make sure that the CGSCC pass manager can handle when instcombine simplifies +; one libcall into an unrelated libcall and update the call graph accordingly. +; +; Also check that it can handle inlining *removing* a libcall entirely. 
+; +; RUN: opt -passes='cgscc(inline,function(instcombine))' -S < %s | FileCheck %s + +define i8* @wibble(i8* %arg1, i8* %arg2) { +; CHECK-LABEL: define i8* @wibble( +bb: + %tmp = alloca [1024 x i8], align 16 + %tmp2 = getelementptr inbounds [1024 x i8], [1024 x i8]* %tmp, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* %arg1, i64 1024, i32 0, i1 false) +; CHECK: call void @llvm.memcpy + %tmp3 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 false, i1 true) + %tmp4 = call i8* @__strncpy_chk(i8* %arg2, i8* %tmp2, i64 1023, i64 %tmp3) +; CHECK-NOT: call +; CHECK: call i8* @strncpy(i8* %arg2, i8* %tmp2, i64 1023) +; CHECK-NOT: call + + ret i8* %tmp4 +; CHECK: ret +} + +define i8* @strncpy(i8* %arg1, i8* %arg2, i64 %size) noinline { +bb: + %result = call i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size) + ret i8* %result +} + +declare i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size) + +declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1) + +declare i8* @__strncpy_chk(i8*, i8*, i64, i64) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +; Check that even when we completely remove a libcall we don't get the call +; graph wrong once we handle libcalls in the call graph specially to address +; the above case. +define i32 @hoge(i32* %arg1) { +; CHECK-LABEL: define i32 @hoge( +bb: + %tmp41 = load i32*, i32** null + %tmp6 = load i32, i32* %arg1 + %tmp7 = call i32 @ntohl(i32 %tmp6) +; CHECK-NOT: call i32 @ntohl + ret i32 %tmp7 +; CHECK: ret i32 +} + +; Even though this function is not used, it should be retained as it may be +; used when doing further libcall transformations. 
+define internal i32 @ntohl(i32 %x) { +; CHECK-LABEL: define internal i32 @ntohl( +entry: + %and2 = lshr i32 %x, 8 + %shr = and i32 %and2, 65280 + ret i32 %shr +} diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll index bf8e596d118b..35f596e77988 100644 --- a/test/Other/new-pass-manager.ll +++ b/test/Other/new-pass-manager.ll @@ -23,6 +23,7 @@ ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-CGSCC-PASS-NEXT: Running pass: NoOpCGSCCPass @@ -407,6 +408,7 @@ ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-REPEAT-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running pass: RepeatedPass diff --git a/test/ThinLTO/X86/debuginfo-cu-import.ll b/test/ThinLTO/X86/debuginfo-cu-import.ll index 42a751191860..e0b066c736e4 100644 --- a/test/ThinLTO/X86/debuginfo-cu-import.ll +++ b/test/ThinLTO/X86/debuginfo-cu-import.ll @@ -51,11 +51,11 @@ entry: !9 = !DIGlobalVariableExpression(var: !10) !10 = !DIGlobalVariable(name: 
"version", scope: !4, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true) !11 = !{!12, !16} -!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, line: 8) +!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, file: !1, line: 8) !13 = distinct !DISubprogram(name: "a", linkageName: "_ZN1A1aEv", scope: !4, file: !1, line: 7, type: !14, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !14 = !DISubroutineType(types: !15) !15 = !{null} -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !4, file: !1, line: 9, type: !14, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 4b9e7c3956f5..1dfc08761965 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -23,9 +23,63 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp3( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 
@llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X32-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X32-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp3( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, 
i8* [[X]], i8 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call @@ -50,27 +104,225 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp5( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X32-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] 
= sub i32 [[TMP13]], [[TMP14]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp5( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp6( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp6( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: 
[[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp7( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X32-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; 
X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp7( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP22:%.*]] 
= getelementptr i8, i8* [[X]], i8 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = zext i8 [[TMP24]] to i32 +; X64-NEXT: [[TMP27:%.*]] = zext i8 [[TMP25]] to i32 +; X64-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP28]], [[LOADBB2]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call @@ -78,8 +330,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: ret i32 [[CALL]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* 
[[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* @@ -99,72 +378,691 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = 
bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP18]] +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = zext i8 [[TMP20]] to i32 +; X32-NEXT: [[TMP23:%.*]] = zext i8 [[TMP21]] to i32 +; X32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP23]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP24]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; 
X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) ret i32 %call } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: 
[[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; 
X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) ret i32 %call } define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ 
[[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = zext 
i8 [[TMP31]] to i32 +; X32-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; X32-NEXT: [[TMP35:%.*]] = sub i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP35]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: 
+; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) ret i32 %call } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: 
[[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ 
[[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) ret i32 %call } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ 
[[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP27]] +; X32-NEXT: [[TMP30:%.*]] = load i8, i8* [[TMP28]] +; X32-NEXT: [[TMP31:%.*]] = zext i8 [[TMP29]] to i32 +; X32-NEXT: [[TMP32:%.*]] = zext i8 [[TMP30]] to i32 +; 
X32-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP33]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: 
[[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) ret i32 %call } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP35:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP36:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = 
getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i16, i16* [[TMP27]], i16 6 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i16, i16* [[TMP28]], i16 6 +; X32-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP31]]) +; X32-NEXT: [[TMP34:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP32]]) +; X32-NEXT: [[TMP35]] = zext i16 [[TMP33]] to i32 +; X32-NEXT: [[TMP36]] = zext i16 [[TMP34]] to i32 +; X32-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP35]], [[TMP36]] +; X32-NEXT: br i1 [[TMP37]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] 
+; +; X64-LABEL: @cmp14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 
+; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) ret i32 %call } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; 
X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X64: loadbb3: +; X64-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP31]] +; X64-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP32]] +; X64-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; X64-NEXT: [[TMP36:%.*]] = zext i8 [[TMP34]] to i32 +; X64-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP36]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: 
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP37]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) ret i32 %call } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP33:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP34:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = 
icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP27]], i32 3 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP28]], i32 3 +; X32-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP30]] +; X32-NEXT: [[TMP33]] = call i32 @llvm.bswap.i32(i32 [[TMP31]]) +; X32-NEXT: [[TMP34]] = call i32 @llvm.bswap.i32(i32 [[TMP32]]) +; X32-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br i1 [[TMP35]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 
[[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) ret i32 %call @@ -190,8 +1088,25 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; 
ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -221,8 +1136,25 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; 
ALL-NEXT: ret i32 [[CONV]] ; @@ -234,8 +1166,27 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -247,8 +1198,34 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* 
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; ALL: loadbb2: +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; ALL-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; ALL-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -260,8 +1237,27 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: 
[[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -283,11 +1279,60 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: 
[[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X32-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 
@memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -296,11 +1341,64 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = 
phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -309,11 +1407,78 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; 
X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: 
@cmp_eq11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -322,11 +1487,64 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: 
[[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: 
[[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0 @@ -335,11 +1553,78 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* 
+; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label 
[[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -348,11 +1633,82 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label 
[[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 6 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i16 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -361,11 +1717,52 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: 
[[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X64: loadbb3: +; X64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X64-NEXT: 
[[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X64-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0 @@ -374,11 +1771,73 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; 
X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 3 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[TMP20]], i32 3 +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 +; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], 
[[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) %cmp = icmp eq i32 %call, 0 diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll index b6b775797826..088b177c2e11 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -194,7 +194,6 @@ rare.2: br label %fallthrough } - declare void @slowpath(i32, i32*) ; Make sure we don't end up in an infinite loop after we fail to sink. @@ -218,3 +217,37 @@ load.i145: pl_loop.i.i122: br label %pl_loop.i.i122 } + +; Make sure we can sink address computation even +; if there is a cycle in phi nodes. +define void @test9(i1 %cond, i64* %base) { +; CHECK-LABEL: @test9 +entry: + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br label %header + +header: + %iv = phi i32 [0, %entry], [%iv.inc, %backedge] + %casted.loop = phi i32* [%casted, %entry], [%casted.merged, %backedge] + br i1 %cond, label %if.then, label %backedge + +if.then: + call void @foo(i32 %iv) + %addr.1 = getelementptr inbounds i64, i64* %base, i64 5 + %casted.1 = bitcast i64* %addr.1 to i32* + br label %backedge + +backedge: +; CHECK-LABEL: backedge: +; CHECK: getelementptr i8, {{.+}} 40 + %casted.merged = phi i32* [%casted.loop, %header], [%casted.1, %if.then] + %v = load i32, i32* %casted.merged, align 4 + call void @foo(i32 %v) + %iv.inc = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.inc, 1000 + br i1 %cmp, label %header, label %exit + +exit: + ret void +} diff --git a/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll new file mode 
100644 index 000000000000..57dbdd883190 --- /dev/null +++ b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -globals-aa -early-cse-memssa | FileCheck %s + +define i16 @f1() readonly { + ret i16 0 +} + +declare void @f2() + +; Check that EarlyCSE correctly handles function calls that don't have +; a MemoryAccess. In this case the calls to @f1 have no +; MemoryAccesses since globals-aa determines that @f1 doesn't +; read/write memory at all. + +define void @f3() { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[CALL1:%.*]] = call i16 @f1() +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; + %call1 = call i16 @f1() + call void @f2() + %call2 = call i16 @f1() + ret void +} diff --git a/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll new file mode 100644 index 000000000000..513379d0bd01 --- /dev/null +++ b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll @@ -0,0 +1,79 @@ +; This test checks if debug loc is propagated to load/store created by GVN/Instcombine. +; RUN: opt < %s -gvn -S | FileCheck %s --check-prefixes=ALL,GVN +; RUN: opt < %s -gvn -instcombine -S | FileCheck %s --check-prefixes=ALL,INSTCOMBINE + +; struct node { +; int *v; +; struct desc *descs; +; }; + +; struct desc { +; struct node *node; +; }; + +; extern int bar(void *v, void* n); + +; int test(struct desc *desc) +; { +; void *v, *n; +; v = !desc ? 
((void *)0) : desc->node->v; // Line 15 +; n = &desc->node->descs[0]; // Line 16 +; return bar(v, n); +; } + +; Line 16, Column 13: +; n = &desc->node->descs[0]; +; ^ + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.desc = type { %struct.node* } +%struct.node = type { i32*, %struct.desc* } + +define i32 @test(%struct.desc* readonly %desc) local_unnamed_addr #0 !dbg !4 { +entry: + %tobool = icmp eq %struct.desc* %desc, null + br i1 %tobool, label %cond.end, label %cond.false, !dbg !9 +; ALL: br i1 %tobool, label %entry.cond.end_crit_edge, label %cond.false, !dbg [[LOC_15_6:![0-9]+]] +; ALL: entry.cond.end_crit_edge: +; GVN: %.pre = load %struct.node*, %struct.node** null, align 8, !dbg [[LOC_16_13:![0-9]+]] +; INSTCOMBINE:store %struct.node* undef, %struct.node** null, align 536870912, !dbg [[LOC_16_13:![0-9]+]] + +cond.false: + %0 = bitcast %struct.desc* %desc to i8***, !dbg !11 + %1 = load i8**, i8*** %0, align 8, !dbg !11 + %2 = load i8*, i8** %1, align 8 + br label %cond.end, !dbg !9 + +cond.end: + %3 = phi i8* [ %2, %cond.false ], [ null, %entry ], !dbg !9 + %node2 = getelementptr inbounds %struct.desc, %struct.desc* %desc, i64 0, i32 0 + %4 = load %struct.node*, %struct.node** %node2, align 8, !dbg !10 + %descs = getelementptr inbounds %struct.node, %struct.node* %4, i64 0, i32 1 + %5 = bitcast %struct.desc** %descs to i8** + %6 = load i8*, i8** %5, align 8 + %call = tail call i32 @bar(i8* %3, i8* %6) + ret i32 %call +} + +declare i32 @bar(i8*, i8*) local_unnamed_addr #1 +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.c", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !5, isLocal: false, isDefinition: true, scopeLine: 13, flags: 
DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7} +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !{} +!9 = !DILocation(line: 15, column: 6, scope: !4) +!10 = !DILocation(line: 16, column: 13, scope: !4) +!11 = !DILocation(line: 15, column: 34, scope: !4) + +;ALL: [[SCOPE:![0-9]+]] = distinct !DISubprogram(name: "test",{{.*}} +;ALL: [[LOC_15_6]] = !DILocation(line: 15, column: 6, scope: [[SCOPE]]) +;ALL: [[LOC_16_13]] = !DILocation(line: 16, column: 13, scope: [[SCOPE]]) diff --git a/test/Transforms/GVN/PRE/phi-translate.ll b/test/Transforms/GVN/PRE/phi-translate.ll index 1f6c7c8d33ea..55f5fd6465b6 100644 --- a/test/Transforms/GVN/PRE/phi-translate.ll +++ b/test/Transforms/GVN/PRE/phi-translate.ll @@ -6,12 +6,12 @@ target datalayout = "e-p:64:64:64" ; CHECK: entry.end_crit_edge: ; CHECK: %[[INDEX:[a-z0-9.]+]] = sext i32 %x to i64{{$}} ; CHECK: %[[ADDRESS:[a-z0-9.]+]] = getelementptr [100 x i32], [100 x i32]* @G, i64 0, i64 %[[INDEX]]{{$}} -; CHECK: %n.pre = load i32, i32* %[[ADDRESS]]{{$}} +; CHECK: %n.pre = load i32, i32* %[[ADDRESS]], !dbg [[N_LOC:![0-9]+]] ; CHECK: br label %end ; CHECK: then: ; CHECK: store i32 %z ; CHECK: end: -; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC:![0-9]+]] +; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC]] ; CHECK: ret i32 %n ; CHECK: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) diff --git a/test/Transforms/GlobalOpt/pr33686.ll b/test/Transforms/GlobalOpt/pr33686.ll new file mode 100644 index 000000000000..d6bb98735f4e --- /dev/null +++ b/test/Transforms/GlobalOpt/pr33686.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -globalopt %s | FileCheck %s + +@glob = external global i16, align 1 + +define void @beth() { +; CHECK-LABEL: @beth( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void + 
+notreachable: + %patatino = select i1 undef, i16* @glob, i16* %patatino + br label %notreachable +} diff --git a/test/Transforms/IRCE/eq_ne.ll b/test/Transforms/IRCE/eq_ne.ll new file mode 100644 index 000000000000..1b1ffe6b94ba --- /dev/null +++ b/test/Transforms/IRCE/eq_ne.ll @@ -0,0 +1,257 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_02: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_03: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_04: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_05: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_06: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_07: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_08: constrained Loop at depth 1 containing: %loop
,%in.bounds + +; Show that IRCE can turn 'ne' condition to 'slt' in increasing IV. +define void @test_01(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_01 +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_02(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_02( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, -100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'sge' in increasing IV. 
+define void @test_03(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_03( +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_04(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_04( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, -100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'ne' condition to 'sgt' in decreasing IV. 
+define void @test_05(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_05( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 0 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'ne' condition to 'sgt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_06(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_06( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 120 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'slt' in decreasing IV. 
+define void @test_07(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_07( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 0 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'eq' condition to 'slt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_08(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_08( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 120 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} +!1 = !{!"branch_weights", i32 64, i32 4} diff --git a/test/Transforms/IRCE/pre_post_loops.ll b/test/Transforms/IRCE/pre_post_loops.ll new file mode 100644 index 000000000000..2cd2e29104fe --- /dev/null +++ b/test/Transforms/IRCE/pre_post_loops.ll @@ -0,0 +1,117 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_02: constrained Loop at depth 1 containing: %loop
,%in.bounds + +; Iterate from 0 to SINT_MAX, check that the post-loop is generated. +define void @test_01(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_01( +; CHECK: entry: +; CHECK-NEXT: %exit.mainloop.at = load i32, i32* %a_len_ptr +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.next, %in.bounds ], [ 0, %loop.preheader ] +; CHECK-NEXT: %idx.next = add i32 %idx, 1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %exit.mainloop.at +; CHECK-NEXT: br i1 true, label %in.bounds, +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp slt i32 %idx.next, 2147483647 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 %idx.next, %exit.mainloop.at +; CHECK-NEXT: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: main.pseudo.exit: +; CHECK-NEXT: %idx.copy = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: %indvar.end = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: br label %postloop +; CHECK: postloop: +; CHECK-NEXT: br label %loop.postloop +; CHECK: loop.postloop: +; CHECK-NEXT: %idx.postloop = phi i32 [ %idx.copy, %postloop ], [ %idx.next.postloop, %in.bounds.postloop ] +; CHECK-NEXT: %idx.next.postloop = add i32 %idx.postloop, 1 +; CHECK-NEXT: %abc.postloop = icmp slt i32 %idx.postloop, %exit.mainloop.at +; CHECK-NEXT: br i1 %abc.postloop, label %in.bounds.postloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.postloop: +; CHECK-NEXT: %addr.postloop = getelementptr i32, i32* %arr, i32 %idx.postloop +; CHECK-NEXT: store i32 0, i32* %addr.postloop +; CHECK-NEXT: %next.postloop = icmp slt i32 %idx.next.postloop, 2147483647 +; CHECK-NEXT: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 
%abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp slt i32 %idx.next, 2147483647 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Iterate from SINT_MAX to 0, check that the pre-loop is generated. +define void @test_02(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_02( +; CHECK: entry: +; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +; CHECH-NEXT: br i1 true, label %loop.preloop.preheader +; CHECK: mainloop: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.preloop.copy, %mainloop ], [ %idx.next, %in.bounds ] +; CHECK-NEXT: %idx.next = add i32 %idx, -1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %len +; CHECK-NEXT: br i1 true, label %in.bounds +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp sgt i32 %idx.next, -1 +; CHECK-NEXT: br i1 %next, label %loop, label %exit.loopexit +; CHECK: loop.preloop: +; CHECK-NEXT: %idx.preloop = phi i32 [ %idx.next.preloop, %in.bounds.preloop ], [ 2147483647, %loop.preloop.preheader ] +; CHECK-NEXT: %idx.next.preloop = add i32 %idx.preloop, -1 +; CHECK-NEXT: %abc.preloop = icmp slt i32 %idx.preloop, %len +; CHECK-NEXT: br i1 %abc.preloop, label %in.bounds.preloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.preloop: +; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +; CHECK-NEXT: store i32 0, i32* %addr.preloop +; CHECK-NEXT: %next.preloop = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: br i1 [[COND]], label %loop.preloop, label %preloop.exit.selector + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 2147483647, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt 
i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp sgt i32 %idx.next, -1 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} diff --git a/test/Transforms/Inline/AArch64/ext.ll b/test/Transforms/Inline/AArch64/ext.ll new file mode 100644 index 000000000000..04095c04ee86 --- /dev/null +++ b/test/Transforms/Inline/AArch64/ext.ll @@ -0,0 +1,249 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; sext can be folded into gep. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = sext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer2(i32* %ptr, i32 %i) { + %C = call i32 @inner2(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner2(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer3(i32* %ptr, i16 %i) { + %C = call i32 @inner3(i32* %ptr, i16 %i) + ret i32 %C +} + +; zext can be folded into gep. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner3(i32* %ptr, i16 %i) { + %E = zext i16 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer4(i8* %ptr) { + %C = call i16 @inner4(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer5(i8* %ptr) { + %C = call i16 @inner5(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer6(i8* %ptr) { + %C = call i32 @inner6(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer7(i8* %ptr) { + %C = call i32 @inner7(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer8(i16* %ptr) { + %C = call i32 @inner8(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner8(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer9(i16* %ptr) { + %C = call i32 @inner9(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner9(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer10(i8* %ptr) { + %C = call i64 @inner10(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer11(i8* %ptr) { + %C = call i64 @inner11(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer12(i16* %ptr) { + %C = call i64 @inner12(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer13(i16* %ptr) { + %C = call i64 @inner13(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer14(i32* %ptr) { + %C = call i64 @inner14(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner14 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner14(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer15(i32* %ptr) { + %C = call i64 @inner15(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner15 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner15(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} + +define i64 @outer16(i32 %V1, i64 %V2) { + %C = call i64 @inner16(i32 %V1, i64 %V2) + ret i64 %C +} + +; sext can be folded into shl. +; CHECK: Analyzing call of inner16 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 4 +define i64 @inner16(i32 %V1, i64 %V2) { + %E = sext i32 %V1 to i64 + %S = shl i64 %E, 3 + %A = add i64 %V2, %S + ret i64 %A +} diff --git a/test/Transforms/Inline/PowerPC/ext.ll b/test/Transforms/Inline/PowerPC/ext.ll new file mode 100644 index 000000000000..f7a409467b2c --- /dev/null +++ b/test/Transforms/Inline/PowerPC/ext.ll @@ -0,0 +1,140 @@ +; REQUIRES: asserts +; RUN: opt -inline -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64le-ibm-linux-gnu" + +define i16 @outer1(i8* %ptr) { + %C = call i16 @inner1(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner1(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i32 @outer2(i8* %ptr) { + %C = call i32 @inner2(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer3(i16* %ptr) { + %C = call i32 @inner3(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner3(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer4(i16* %ptr) { + %C = call i32 @inner4(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer5(i8* %ptr) { + %C = call i64 @inner5(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer6(i16* %ptr) { + %C = call i64 @inner6(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer7(i16* %ptr) { + %C = call i64 @inner7(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer8(i32* %ptr) { + %C = call i64 @inner8(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer9(i32* %ptr) { + %C = call i64 @inner9(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/Inline/PowerPC/lit.local.cfg b/test/Transforms/Inline/PowerPC/lit.local.cfg new file mode 100644 index 000000000000..5d33887ff0a4 --- /dev/null +++ b/test/Transforms/Inline/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/Inline/X86/ext.ll b/test/Transforms/Inline/X86/ext.ll new file mode 100644 index 000000000000..bffda3852799 --- /dev/null +++ b/test/Transforms/Inline/X86/ext.ll @@ -0,0 +1,201 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=x86_64-unknown-unknown -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer2(i8* %ptr) { + %C = call i16 @inner2(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer3(i8* %ptr) { + %C = call i16 @inner3(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner3(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer4(i8* %ptr) { + %C = call i32 @inner4(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer5(i8* %ptr) { + %C = call i32 @inner5(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer6(i16* %ptr) { + %C = call i32 @inner6(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer7(i16* %ptr) { + %C = call i32 @inner7(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer8(i8* %ptr) { + %C = call i64 @inner8(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer9(i8* %ptr) { + %C = call i64 @inner9(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer10(i16* %ptr) { + %C = call i64 @inner10(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer11(i16* %ptr) { + %C = call i64 @inner11(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer12(i32* %ptr) { + %C = call i64 @inner12(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer13(i32* %ptr) { + %C = call i64 @inner13(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll index 3c4e08b5b515..905357817509 100644 --- a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll +++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll @@ -1,7 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK: llvm.umul.with.overflow define i32 @sterix(i32, i8, i64) { +; CHECK-LABEL: @sterix( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[TMP1:%.*]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV1]], 1945964878 +; CHECK-NEXT: [[SH_PROM:%.*]] = trunc i64 [[TMP2:%.*]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]] +; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[SHR]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[CONV6:%.*]] = and i64 [[MUL3]], 4294967295 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[CONV6]], [[MUL3]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] +; CHECK: lor.rhs: +; CHECK-NEXT: [[AND:%.*]] = and i64 [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[AND]] to i32 +; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp eq i32 [[CONV4]], 0 +; CHECK-NEXT: [[PHITMP:%.*]] = zext i1 [[TOBOOL7]] to i32 +; CHECK-NEXT: br label [[LOR_END]] +; CHECK: lor.end: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHITMP]], [[LOR_RHS]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; entry: %conv = zext i32 %0 to i64 %conv1 = sext i8 %1 to i32 diff --git a/test/Transforms/InstCombine/and-not-or.ll b/test/Transforms/InstCombine/and-not-or.ll deleted file mode 100644 index 
a42140be2805..000000000000 --- a/test/Transforms/InstCombine/and-not-or.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep "and i32 %x, %y" | count 4 -; RUN: opt < %s -instcombine -S | not grep "or" - -define i32 @func1(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func2(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func3(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %y, %o - ret i32 %a -} - -define i32 @func4(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %y, %o - ret i32 %a -} diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index 7bb9b95b3179..c12662d4db0e 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -628,3 +628,195 @@ define i32 @test43(i32 %a, i32 %c, i32 %d) { %and = and i32 %or, %xor ret i32 %and } + +; (~y | x) & y -> x & y +define i32 @test44(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test44( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %o, %y + ret i32 %a +} + +; (x | ~y) & y -> x & y +define i32 @test45(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test45( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %o, %y + ret i32 %a +} + +; y & (~y | x) -> y | x +define i32 @test46(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test46( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %y, %o + ret i32 %a +} + +; y & (x | ~y) -> y | x +define i32 @test47(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test47( +; 
CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %y, %o + ret i32 %a +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (X & (Y | ~X)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @and_orn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x_inv + %and = and i1 %x, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | ~X) & X) -> (X & Y), where 'not' is an inverted cmp + +define <2 x i1> @and_orn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @and_orn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[AND]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, ; thwart complexity-based ordering + %or = or <2 x i1> %y, %x_inv + %and = and <2 x i1> %or, %x + ret <2 x i1> %and +} + +; Commute the 'or': +; (X & (~X | Y)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @and_orn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x_inv, %y + %and = and i1 %x, %or + ret i1 %and +} + +; Commute the 'and': +; ((~X | Y) & X) 
-> (X & Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[AND]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, ; thwart complexity-based ordering + %or = or <3 x i1> %x_inv, %y + %and = and <3 x i1> %or, %x + ret <3 x i1> %and +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (~X & (Y | X)) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @andn_or_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %x_inv, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | X) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @andn_or_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %or, %x_inv + ret i1 %and +} + +; Commute the 'or': +; (~X & (X | Y)) -> (~X & Y), where 'not' is an inverted cmp + +define <4 x i1> @andn_or_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @andn_or_cmp_3( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ule 
<4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[AND]] +; + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, ; thwart complexity-based ordering + %or = or <4 x i1> %x, %y + %and = and <4 x i1> %x_inv, %or + ret <4 x i1> %and +} + +; Commute the 'and': +; ((X | Y) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @andn_or_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x, %y + %and = and i1 %or, %x_inv + ret i1 %and +} diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll index 001ac58891e4..15772d158f62 100644 --- a/test/Transforms/InstCombine/and2.ll +++ b/test/Transforms/InstCombine/and2.ll @@ -98,8 +98,7 @@ define i64 @test9(i64 %x) { ; combine -x & 1 into x & 1 define <2 x i64> @test9vec(<2 x i64> %x) { ; CHECK-LABEL: @test9vec( -; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i64> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> %x, ; CHECK-NEXT: ret <2 x i64> [[AND]] ; %sub = sub nsw <2 x i64> , %x @@ -119,6 +118,88 @@ define i64 @test10(i64 %x) { ret i64 %add } +; (1 << x) & 1 --> zext(x == 0) + +define i8 @and1_shl1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. 
+ +define i8 @and1_shl1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = shl i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 << x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = shl <2 x i8> , %x + %and = and <2 x i8> %sh, + ret <2 x i8> %and +} + +; (1 >> x) & 1 --> zext(x == 0) + +define i8 @and1_lshr1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. + +define i8 @and1_lshr1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = lshr i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 >> x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = lshr <2 x i8> , %x + %and = and <2 x i8> %sh, + ret <2 x i8> %and +} + ; The add in this test is unnecessary because the LSBs of the LHS are 0 and the 'and' only consumes bits from those LSBs. 
It doesn't matter what happens to the upper bits. define i32 @test11(i32 %a, i32 %b) { ; CHECK-LABEL: @test11( diff --git a/test/Transforms/InstCombine/element-atomic-memintrins.ll b/test/Transforms/InstCombine/element-atomic-memintrins.ll new file mode 100644 index 000000000000..2e3bfd7b721d --- /dev/null +++ b/test/Transforms/InstCombine/element-atomic-memintrins.ll @@ -0,0 +1,98 @@ +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that inst combine handles these intrinsics properly once they have been +;; added to that class hierarchy. + +; RUN: opt -instcombine -S < %s | FileCheck %s + +;; ---- memset ----- + +; Ensure 0-length memset isn't removed +define void @test_memset_zero_length(i8* %dest) { + ; CHECK-LABEL: test_memset_zero_length + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ret void +} + +; Ensure that small-sized memsets don't convert to stores +define void @test_memset_to_store(i8* %dest) { + ; CHECK-LABEL: test_memset_to_store + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + call void 
@llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ret void +} + +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly + + +;; ========================================= +;; ----- memmove ------ + +; memmove from a global constant source does not become memcpy +@gconst = constant [8 x i8] c"0123456\00" +define void @test_memmove_to_memcpy(i8* %dest) { + ; CHECK-LABEL: test_memmove_to_memcpy + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ret void +} + +define void @test_memmove_zero_length(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_zero_length + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 
%src, i32 0, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ret void +} + +; memmove with src==dest is removed +define void @test_memmove_removed(i8* %srcdest, i32 %sz) { + ; CHECK-LABEL: test_memmove_removed + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ret void +} + +; memmove with a small constant length is converted to a load/store pair +define void 
@test_memmove_loadstore(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_loadstore + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ret void +} + +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll index faae2016e207..aa95cc5a1316 100644 --- a/test/Transforms/InstCombine/icmp-logical.ll +++ b/test/Transforms/InstCombine/icmp-logical.ll @@ -1,159 +1,138 @@ ; RUN: opt -instcombine -S -o - %s | FileCheck %s define i1 @masked_and_notallzeroes(i32 %A) { -; CHECK-LABEL: @masked_and_notallzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 
%mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 0 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notallones(i32 %A) { -; CHECK-LABEL: @masked_and_notallones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 39 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allones(i32 %A) { -; CHECK-LABEL: @masked_or_allones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 39 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notA(i32 %A) { -; CHECK-LABEL: @masked_and_notA -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp ne i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notA( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp ne i32 
[[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, %A - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_A(i32 %A) { -; CHECK-LABEL: @masked_or_A -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_A( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, %A - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes_notoptimised(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes_notoptimised -; CHECK: [[MASK:%.*]] = and i32 %A, 15 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes_notoptimised( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 15 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], 0 +; CHECK-NEXT: [[RES:%.*]] = or i1 [[TST1]], [[TST2]] +; CHECK-NEXT: ret i1 [[RES]] +; %mask1 = and i32 %A, 15 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @nomask_lhs(i32 %in) { -; CHECK-LABEL: @nomask_lhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_lhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST2]] +; %tst1 = icmp eq i32 %in, 0 - %masked = and i32 %in, 1 %tst2 = icmp eq i32 %masked, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } - define i1 @nomask_rhs(i32 %in) { -; CHECK-LABEL: @nomask_rhs -; 
CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_rhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %masked = and i32 %in, 1 %tst1 = icmp eq i32 %masked, 0 - %tst2 = icmp eq i32 %in, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_false(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_false -; CHECK: ret i1 false +; CHECK-LABEL: @fold_mask_cmps_to_false( +; CHECK-NEXT: ret i1 false +; %1 = and i32 %x, 2147483647 %2 = icmp eq i32 %1, 0 %3 = icmp eq i32 %x, 2147483647 @@ -161,12 +140,46 @@ define i1 @fold_mask_cmps_to_false(i32 %x) { ret i1 %4 } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_true(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_true -; CHECK: ret i1 true +; CHECK-LABEL: @fold_mask_cmps_to_true( +; CHECK-NEXT: ret i1 true +; %1 = and i32 %x, 2147483647 %2 = icmp ne i32 %1, 0 %3 = icmp ne i32 %x, 2147483647 %4 = or i1 %3, %2 ret i1 %4 } + +; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401 + +define i1 @cmpeq_bitwise(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: @cmpeq_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 %c, %d +; CHECK-NEXT: [[CMP:%.*]] = and i1 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %xor1 = xor i8 %a, %b + %xor2 = xor i8 %c, %d + %or = or i8 %xor1, %xor2 + %cmp = icmp eq i8 %or, 0 + ret i1 %cmp +} + +define <2 x i1> @cmpne_bitwise(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { +; CHECK-LABEL: @cmpne_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> %c, %d +; CHECK-NEXT: [[CMP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] +; 
CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %xor1 = xor <2 x i64> %a, %b + %xor2 = xor <2 x i64> %c, %d + %or = or <2 x i64> %xor1, %xor2 + %cmp = icmp ne <2 x i64> %or, zeroinitializer + ret <2 x i1> %cmp +} + diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index 947971c6c83b..be64f51b6c4c 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s -define i32 @test1(i32 %x, i32 %y) nounwind { +; X | ~(X | Y) --> X | ~Y + +define i32 @test1(i32 %x, i32 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -13,7 +15,10 @@ define i32 @test1(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test2(i32 %x, i32 %y) nounwind { +; Commute (rename) the inner 'or' operands: +; Y | ~(X | Y) --> ~X | Y + +define i32 @test2(i32 %x, i32 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -25,7 +30,9 @@ define i32 @test2(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test3(i32 %x, i32 %y) nounwind { +; X | ~(X ^ Y) --> X | ~Y + +define i32 @test3(i32 %x, i32 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -37,7 +44,10 @@ define i32 @test3(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test4(i32 %x, i32 %y) nounwind { +; Commute (rename) the 'xor' operands: +; Y | ~(X ^ Y) --> ~X | Y + +define i32 @test4(i32 %x, i32 %y) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -49,7 +59,7 @@ define i32 @test4(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test5(i32 %x, i32 %y) nounwind { +define i32 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: ret i32 -1 ; @@ 
-59,7 +69,7 @@ define i32 @test5(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test6(i32 %x, i32 %y) nounwind { +define i32 @test6(i32 %x, i32 %y) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: ret i32 -1 ; @@ -69,7 +79,7 @@ define i32 @test6(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test7(i32 %x, i32 %y) nounwind { +define i32 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[Z:%.*]] = or i32 %x, %y ; CHECK-NEXT: ret i32 [[Z]] @@ -79,7 +89,7 @@ define i32 @test7(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test8(i32 %x, i32 %y) nounwind { +define i32 @test8(i32 %x, i32 %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -91,7 +101,7 @@ define i32 @test8(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test9(i32 %x, i32 %y) nounwind { +define i32 @test9(i32 %x, i32 %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index 764fe4503b5e..fb56449ba4d4 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -397,14 +397,74 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) { ret <2 x i132> %or } -define i32 @test39(i32 %a, i32 %b) { -; CHECK-LABEL: @test39( -; CHECK-NEXT: [[OR:%.*]] = or i32 %b, %a +; (~A & B) | A --> A | B + +define i32 @test39a(i32 %a, float %b) { +; CHECK-LABEL: @test39a( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] ; CHECK-NEXT: ret i32 [[OR]] ; - %xor = xor i32 %a, -1 - %and = and i32 %xor, %b - %or = or i32 %and, %a + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %and, %a1 + ret i32 %or +} + +; 
Commute 'and' operands: +; (B & ~A) | A --> A | B + +define i32 @test39b(i32 %a, float %b) { +; CHECK-LABEL: @test39b( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %and, %a1 + ret i32 %or +} + +; Commute 'or' operands: +; A | (~A & B) --> A | B + +define i32 @test39c(i32 %a, float %b) { +; CHECK-LABEL: @test39c( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %a1, %and + ret i32 %or +} + +; Commute 'and' operands: +; A | (B & ~A) --> A | B + +define i32 @test39d(i32 %a, float %b) { +; CHECK-LABEL: @test39d( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %a1, %and ret i32 %or } @@ -456,60 +516,6 @@ define i32 @test40d(i32 %a, i32 %b) { ret i32 %or } -define i32 @test41(i32 %a, i32 %b) { -; CHECK-LABEL: @test41( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %and = and i32 %a, %b - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %or = or i32 %and, %xor - ret i32 %or -} - -; (~A ^ B) | (A & B) -> (~A ^ B) - -define i32 
@test42(i32 %a, i32 %b) { -; CHECK-LABEL: @test42( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_and(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_and( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %b, %a - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_xor(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %b, %nega - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - define i32 @test45(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, %z @@ -648,41 +654,146 @@ final: ret <2 x i32> %value } -define i8 @test51(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test51( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; In the next 4 tests, vary the types and predicates for extra coverage. 
+; (X | (Y & ~X)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @or_andn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x_inv + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & ~X) | X) -> (X | Y), where 'not' is an inverted cmp + +define <2 x i1> @or_andn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[OR]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, ; thwart complexity-based ordering + %and = and <2 x i1> %y, %x_inv + %or = or <2 x i1> %and, %x + ret <2 x i1> %or +} + +; Commute the 'and': +; (X | (~X & Y)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @or_andn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x_inv, %y + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((~X & Y) | X) -> (X | Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp 
ugt <3 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[OR]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, ; thwart complexity-based ordering + %and = and <3 x i1> %x_inv, %y + %or = or <3 x i1> %and, %x + ret <3 x i1> %or +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (~X | (Y & X)) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @orn_and_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %x_inv, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & X) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @orn_and_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %a, -1 - %y = and i8 %w, %z - %x = or i8 %y, %a - ret i8 %x + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %and, %x_inv + ret i1 %or } -define i8 @test52(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test52( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'and': +; (~X | (X & Y)) -> (~X | Y), where 'not' is an inverted cmp + +define <4 x i1> @orn_and_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @orn_and_cmp_3( +; 
CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, ; thwart complexity-based ordering + %and = and <4 x i1> %x, %y + %or = or <4 x i1> %x_inv, %and + ret <4 x i1> %or } -define i8 @test53(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test53( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'or': +; ((X & Y) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @orn_and_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x, %y + %or = or i1 %and, %x_inv + ret i1 %or } diff --git a/test/Transforms/InstCombine/pr33765.ll b/test/Transforms/InstCombine/pr33765.ll new file mode 100644 index 000000000000..99ed0d13b5cf --- /dev/null +++ b/test/Transforms/InstCombine/pr33765.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S %s -instcombine | FileCheck %s + +@glob = external global i16 + +define void @patatino(i8 %beth) { +; CHECK-LABEL: @patatino( +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[BETH:%.*]] to i32 +; CHECK-NEXT: br i1 undef, label [[IF_THEN9:%.*]], label [[IF_THEN9]] +; CHECK: if.then9: +; CHECK-NEXT: [[MUL:%.*]] = mul 
nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[TINKY:%.*]] = load i16, i16* @glob, align 2 +; CHECK-NEXT: [[CONV131:%.*]] = zext i16 [[TINKY]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[MUL]], [[CONV131]] +; CHECK-NEXT: [[CONV14:%.*]] = trunc i32 [[AND]] to i16 +; CHECK-NEXT: store i16 [[CONV14]], i16* @glob, align 2 +; CHECK-NEXT: ret void +; + %conv = zext i8 %beth to i32 + %mul = mul nuw nsw i32 %conv, %conv + %conv3 = and i32 %mul, 255 + %tobool8 = icmp ne i32 %mul, %conv3 + br i1 %tobool8, label %if.then9, label %if.then9 + +if.then9: + %tinky = load i16, i16* @glob + %conv13 = sext i16 %tinky to i32 + %and = and i32 %mul, %conv13 + %conv14 = trunc i32 %and to i16 + store i16 %conv14, i16* @glob + ret void +} diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll index 6a3cf7edd7dc..5e84ec54971a 100644 --- a/test/Transforms/JumpThreading/select.ll +++ b/test/Transforms/JumpThreading/select.ll @@ -280,10 +280,85 @@ cond.false.15.i: ; preds = %cond.false.10.i ret i32 %j.add3 ; CHECK-LABEL: @unfold3 -; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i +; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i ; CHECK: br i1 %cmp4.i, label %.exit.thread, label %cond.false.6.i ; CHECK: br i1 %cmp8.i, label %.exit.thread2, label %cond.false.10.i ; CHECK: br i1 %cmp13.i, label %.exit.thread, label %.exit ; CHECK: br i1 %phitmp, label %.exit.thread, label %.exit.thread2 ; CHECK: br label %.exit.thread2 } + +define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + 
%cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 1, %entry ], [ 0, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 0, %cond.false.10.i ] + %lnot.i18 = icmp eq i32 %cond23.i, 1 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %add3 + ret i32 %j.add3 + +; CHECK-LABEL: @unfold4 +; CHECK: br i1 %cmp.i, label %.exit.thread, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit.thread3, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit.thread, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit.thread3, label %.exit +; CHECK: br i1 %lnot.i18, label %.exit.thread, label %.exit.thread3 +; CHECK: br label %.exit.thread3 +} + +define i32 @unfold5(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 2, %entry ], [ 3, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 7, %cond.false.10.i ] + %lnot.i18 = icmp sgt i32 %cond23.i, 5 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 
%cond23.i + ret i32 %j.add3 + +; CHECK-LABEL: @unfold5 +; CHECK: br i1 %cmp.i, label %.exit, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit, label %cond.false.15.i +; CHECK: br label %.exit +} diff --git a/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll new file mode 100644 index 000000000000..df6c6cfdbcb5 --- /dev/null +++ b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer + +;; FIXME: +;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. 
+;; for(gi=1;gi=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.body3.split1: ; preds = %for.cond1.preheader +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10 +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, 
label %for.body3, label %for.end11 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end11: +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange-simple-count-up.ll b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll new file mode 100644 index 000000000000..4febe0269810 --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; for(int i=0;i=0;j--) -;; A[j][i] = A[j][i]+k; - -define void @interchange_02(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - %cmp2 = icmp sgt i64 %indvars.iv, 0 - br i1 %cmp2, label %for.body3, label %for.inc10 - -for.inc10: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond = icmp eq i64 %indvars.iv.next20, 100 - br i1 %exitcond, label %for.end11, label %for.cond1.preheader - -for.end11: - ret void -} - -; CHECK-LABEL: @interchange_02 -; CHECK: entry: -; CHECK: br label %for.body3.preheader -; CHECK: 
for.cond1.preheader.preheader: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: -; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.split1 -; CHECK: for.body3.preheader: -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.body3.split1: ; preds = %for.cond1.preheader -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: br label %for.inc10 -; CHECK: for.body3.split: -; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 -; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 -; CHECK: for.inc10: -; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 -; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader -; CHECK: for.end11: -; CHECK: ret void - -;;--------------------------------------Test case 03------------------------------------- -;; Loops should not be interchanged in this case as it is not profitable. 
-;; for(int i=0;i<100;i++) -;; for(int j=0;j<100;j++) -;; A[i][j] = A[i][j]+k; - -define void @interchange_03(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 - br i1 %exitcond23, label %for.end12, label %for.cond1.preheader - -for.end12: - ret void -} - -; CHECK-LABEL: @interchange_03 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.cond1.preheader.preheader: ; preds = %entry -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 -; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.preheader -; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp eq 
i64 %indvars.iv.next, 100 -; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 -; CHECK: for.inc10: ; preds = %for.body3 -; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 -; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 -; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader -; CHECK: for.end12: ; preds = %for.inc10 -; CHECK: ret void - - -;;--------------------------------------Test case 04------------------------------------- -;; Loops should not be interchanged in this case as it is not legal due to dependency. -;; for(int j=0;j<99;j++) -;; for(int i=0;i<99;i++) -;; A[j][i+1] = A[j+1][i]+k; - -define void @interchange_04(i32 %k){ -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] - %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add6 = add nsw i32 %0, %k - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next - store i32 %add6, i32* %arrayidx11 - %exitcond = icmp eq i64 %indvars.iv.next, 99 - br i1 %exitcond, label %for.inc12, label %for.body3 - -for.inc12: - %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 - br i1 %exitcond25, label %for.end14, label %for.cond1.preheader - -for.end14: - ret void -} - -; CHECK-LABEL: @interchange_04 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry -; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] -; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 -; CHECK: br 
label %for.body3 -; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader -; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add6 = add nsw i32 %0, %k -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next -; CHECK: store i32 %add6, i32* %arrayidx11 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 -; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 -; CHECK: for.inc12: ; preds = %for.body3 -; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 -; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader -; CHECK: for.end14: ; preds = %for.inc12 -; CHECK: ret void - - - -;;--------------------------------------Test case 05------------------------------------- -;; Loops not tightly nested are not interchanged -;; for(int j=0;j [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -79,7 +79,7 @@ for.exit: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, 
%for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -144,7 +144,7 @@ scalar.body: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -288,7 +288,7 @@ for.cond.cleanup3: ; UNROLL-NO-IC-LABEL: @PR30183( ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 ; UNROLL-NO-IC-NEXT: br label %vector.body ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 8eec6e262c1a..a7cc4530ceb3 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -15,7 +15,7 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop1( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 
x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer @@ -37,7 +37,7 @@ ; VEC4_INTERL2-LABEL: @fp_iv_loop1( ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer @@ -63,7 +63,7 @@ ; VEC1_INTERL2-LABEL: @fp_iv_loop1( ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: br label %vector.body ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 @@ -115,7 +115,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1-LABEL: @fp_iv_loop2( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], ; VEC4_INTERL1-NEXT: br label %vector.body @@ -172,7 +172,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, 
float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer @@ -250,7 +250,7 @@ for.end: ; VEC4_INTERL1-LABEL: @fp_iv_loop4( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -289,7 +289,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( ; VEC2_INTERL1_PRED_STORE: vector.body: -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll index 7f381ae6ad7b..0d6e4b1e61b4 100644 --- a/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -13,24 +13,21 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]] -; CHECK: min.iters.checked: -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0 -; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -55,10 +52,10 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; 
CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index 33e8ed067160..b37537efcc51 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -15,7 +15,7 @@ ; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer @@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-LABEL: @induction_with_loop_inv( ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK: 
[[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 7e9e6b1cdc8e..d77806da59be 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -501,13 +501,13 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. ; CHECK-LABEL: max_i32_backedgetaken -; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 true, label %scalar.ph, label %vector.ph ; CHECK: middle.block: ; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 1e8b982363d8..89c0ac109167 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; ; CHECK-LABEL: @interleaved_with_cond_store_0( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select 
i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -58,7 +58,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_1( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -117,7 +117,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_2( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll index d84dc42bdf54..530c2f66552a 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -338,7 +338,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @even_load_dynamic_tc( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -579,7 +579,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @PR27626_0( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -627,7 +627,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_1( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -680,7 +680,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_2( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 
%[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -728,7 +728,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_3( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/iv_outside_user.ll b/test/Transforms/LoopVectorize/iv_outside_user.ll index 8a44af96e7f4..265188886996 100644 --- a/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -135,7 +135,7 @@ for.end: } ; CHECK-LABEL: @PR30742 -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll index 1cb67f9150ac..f5f4eb5eaa01 100644 --- a/test/Transforms/LoopVectorize/miniters.ll +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -10,10 +10,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. 
; CHECK-LABEL: foo( ; CHECK: %min.iters.check = icmp ult i64 %N, 4 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph ; UNROLL-LABEL: foo( ; UNROLL: %min.iters.check = icmp ult i64 %N, 8 -; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph define void @foo(i64 %N) { entry: diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll new file mode 100644 index 000000000000..40af8f3adf02 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -0,0 +1,240 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the vectorizer identifies the %p.09 phi, +; as an induction variable, despite the potential overflow +; due to the truncation from 32bit to 8bit. +; SCEV will detect the pattern "sext(trunc(%p.09)) + %step" +; and generate the required runtime checks under which +; we can assume no overflow. We check here that we generate +; exactly two runtime checks: +; 1) an overflow check: +; {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) an equality check verifying that the step of the induction +; is equal to sext(trunc(step)): +; Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32) +; +; See also pr30654. 
+; +; int a[N]; +; void doit1(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit1 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @doit1(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.09, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Same as above, but for checking the SCEV "zext(trunc(%p.09)) + %step". 
+; Here we expect the following two predicates to be added for runtime checking: +; 1) {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) Equal predicate: %step == (zext i8 (trunc i32 %step to i8) to i32) +; +; int a[N]; +; void doit2(int n, int step) { +; int i; +; unsigned char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit2 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit2(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %conv = and i32 %p.09, 255 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Here we check that the same phi scev analysis would fail +; to create the runtime checks because the step is not invariant. +; As a result vectorization will fail. 
+; +; int a[N]; +; void doit3(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; step += 2; +; } +; } +; + +; CHECK-LABEL: @doit3 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: + +; Function Attrs: norecurse nounwind uwtable +define void @doit3(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.012 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ] + %sext = shl i32 %p.012, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step.addr.010 + %add3 = add nsw i32 %step.addr.010, 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Lastly, we also check the case where we can tell at compile time that +; the step of the induction is equal to sext(trunc(step)), in which case +; we don't have to check this equality at runtime (we only need the +; runtime overflow check). 
Therefore only the following overflow predicate +; will be added for runtime checking: +; {0,+,%cstep}<%for.body> Added Flags: +; +; a[N]; +; void doit4(int n, char cstep) { +; int i; +; char p = 0; +; int istep = cstep; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + istep; +; } +; } + +; CHECK-LABEL: @doit4 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %{{.*}} = or i1 {{.*}}, %mul.overflow +; CHECK-NOT: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK-NOT: %{{.*}} = or i1 %{{.*}}, %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr { +entry: + %conv = sext i8 %cstep to i32 + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.011, 24 + %conv2 = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv2, i32* %arrayidx, align 4 + %add = add nsw i32 %conv2, %conv + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll index ac1145aab67b..b37d94c0c328 100644 --- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ 
b/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -4,7 +4,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @add_ints( ;CHECK: br -;CHECK: br ;CHECK: getelementptr ;CHECK-DAG: getelementptr ;CHECK-DAG: icmp ugt diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 958b3c135c97..fb0548612715 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: define i32 @foo ;CHECK: for.body.preheader: -;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] +;CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] ;CHECK: vector.memcheck: ;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]] ;CHECK: load <4 x float> diff --git a/test/tools/llvm-cov/showTabsHTML.cpp b/test/tools/llvm-cov/showTabsHTML.cpp index 953c06a3c60d..c092841aeb22 100644 --- a/test/tools/llvm-cov/showTabsHTML.cpp +++ b/test/tools/llvm-cov/showTabsHTML.cpp @@ -1,5 +1,5 @@ // RUN: llvm-profdata merge -o %t.profdata %S/Inputs/showTabsHTML.proftext -// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck -check-prefix=CHECK %s +// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck %s int main(int argc, char ** argv) { (void) "This tab starts at column 0"; // CHECK:   (void) "This tab starts at column 0"; @@ -13,4 +13,4 @@ int main(int argc, char ** argv) { // CHECK-TABSIZE:    (void) "This tab starts at column 0"; // CHECK-TABSIZE: (void) "  This tab starts at column 10"; -// CHECK-TABSIZE: (void) "This     tab starts at column 15"; \ No newline at end of 
file +// CHECK-TABSIZE: (void) "This     tab starts at column 15"; diff --git a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s new file mode 100644 index 000000000000..90733eda278f --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s @@ -0,0 +1,193 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: error: DIE has invalid DW_AT_stmt_list encoding:{{[[:space:]]}} +# CHECK-NEXT: 0x0000000c: DW_TAG_compile_unit [1] * +# CHECK-NEXT: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000000] = "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)") +# CHECK-NEXT: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +# CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "basic.c") +# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_strx4] ( indexed (00000000) string = ) +# CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000003f] = "/Users/sgravani/Development/tests") +# CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +# CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4] (0x00000016){{[[:space:]]}} +# CHECK-NEXT: Units[2] - start offset: 0x00000068 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. +# CHECK-NEXT: Error: The unit type encoding is not valid. 
+ + + .section __TEXT,__text,regular,pure_instructions + .globl _main ## -- Begin function main + .p2align 4, 0x90 +_main: ## @main +Lfunc_begin0: + .file 1 "basic.c" + .loc 1 1 0 ## basic.c:1:0 + .cfi_startproc +## BB#0: ## %entry + pushq %rbp +Lcfi0: + .cfi_def_cfa_offset 16 +Lcfi1: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +Lcfi2: + .cfi_def_cfa_register %rbp + xorl %eax, %eax + movl $0, -4(%rbp) +Ltmp0: + .loc 1 2 7 prologue_end ## basic.c:2:7 + movl $1, -8(%rbp) + .loc 1 3 3 ## basic.c:3:3 + popq %rbp + retq +Ltmp1: +Lfunc_end0: + .cfi_endproc + ## -- End function + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "main" ## string offset=97 + .asciz "int" ## string offset=102 + .asciz "a" ## string offset=106 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 14 ## DW_FORM_strp + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 16 ## DW_AT_stmt_list + .byte 40 ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding: + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## 
DW_FORM_data1 + .byte 39 ## DW_AT_prototyped + .byte 25 ## DW_FORM_flag_present + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 87 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 8 ## Address Size (in bytes) +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset0 + .byte 1 ## Abbrev [1] 0xc:0x4f DW_TAG_compile_unit + .long 0 ## DW_AT_producer + .short 12 ## DW_AT_language + .long 55 ## DW_AT_name +Lset1 = Lline_table_start0-Lsection_line ## DW_AT_stmt_list + .long Lset1 + .long 63 ## DW_AT_comp_dir + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset2 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset2 + .byte 2 ## Abbrev [2] 0x2b:0x28 DW_TAG_subprogram + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset3 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset3 + .byte 1 ## DW_AT_frame_base + .byte 86 + .long 97 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + ## DW_AT_prototyped + .long 83 ## DW_AT_type + ## DW_AT_external + .byte 3 ## Abbrev [3] 0x44:0xe DW_TAG_variable + .byte 2 ## DW_AT_location + .byte 145 + .byte 120 + .long 106 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 2 ## DW_AT_decl_line + .long 83 ## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0x53:0x7 DW_TAG_base_type + .long 102 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 0 ## End Of Children Mark +Lcu_begin1: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin0: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. 
+ .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s new file mode 100644 index 000000000000..a3a54077bbf9 --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s @@ -0,0 +1,81 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: Units[1] - start offset: 0x0000000d +# CHECK-NEXT: Error: The unit type encoding is not valid. +# CHECK-NEXT: Error: The address size is unsupported. +# CHECK-NEXT: Units[2] - start offset: 0x00000026 +# CHECK-NEXT: Error: The 16 bit unit header version is not valid. +# CHECK-NEXT: Error: The offset into the .debug_abbrev section is not valid. +# CHECK-NEXT: Units[4] - start offset: 0x00000041 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. 
+ + .section __TEXT,__text,regular,pure_instructions + .file 1 "basic.c" + .comm _i,4,2 ## @i + .comm _j,4,2 ## @j + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 307232) (llvm/trunk 307042)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "i" ## string offset=97 + .asciz "int" ## string offset=99 + .asciz "j" ## string offset=103 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 9 ## Length of Unit + .short 4 ## DWARF version number +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset0 + .byte 4 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Ltu_begin0: + .long 21 ## Length of Unit + .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type -- Error: The unit type encoding is not valid. + .byte 3 ## Address Size (in bytes) -- Error: The address size is unsupported. + .long 0 + .quad 0 + .long 0 + .byte 0 +Lcu_begin1: + .long 10 ## Length of Unit + .short 6 ## DWARF version number -- Error: The 16 bit unit header version is not valid. + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) -- The offset into the .debug_abbrev section is not valid. + .long Lline_table_start0 + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Lcu_begin2: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin1: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. 
+ .short 5 ## DWARF version number + .byte 2 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-mt/help.test b/test/tools/llvm-mt/help.test new file mode 100644 index 000000000000..29e3667ec2ca --- /dev/null +++ b/test/tools/llvm-mt/help.test @@ -0,0 +1,7 @@ +RUN: llvm-mt /h | FileCheck %s -check-prefix=HELP + +RUN: llvm-mt /inputresource:foo.res /manifest foo.manifest | FileCheck %s -check-prefix=NOT_SUPPORTED + +HELP: OVERVIEW: Manifest Tool + +NOT_SUPPORTED: llvm-mt: ignoring unsupported 'inputresource:' option diff --git a/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 new file mode 100644 index 000000000000..58ed3c7a48fc Binary files /dev/null and b/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 differ diff --git a/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test b/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test new file mode 100644 index 000000000000..4cc4a6eca2a1 --- /dev/null +++ b/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test @@ -0,0 +1,6 @@ +RUN: llvm-objdump -r %p/Inputs/reloc-addend.obj.macho-aarch64 | FileCheck %s + +CHECK-DAG: 0000000000000004 ARM64_RELOC_ADDEND 0x999 +CHECK-DAG: 0000000000000004 ARM64_RELOC_PAGEOFF12 _stringbuf +CHECK-DAG: 0000000000000000 ARM64_RELOC_ADDEND 0x999 +CHECK-DAG: 0000000000000000 ARM64_RELOC_PAGE21 _stringbuf diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 b/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 index fb9d37881c98..01bd1c2fc1ed 100644 Binary files a/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 and b/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 differ diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table.c 
b/test/tools/llvm-readobj/Inputs/dynamic-table.c index b5251f87ee53..0af66ca0c53f 100644 --- a/test/tools/llvm-readobj/Inputs/dynamic-table.c +++ b/test/tools/llvm-readobj/Inputs/dynamic-table.c @@ -1,10 +1,10 @@ // clang -target x86_64-linux-gnu -shared -fPIC -lc dynamic-table.c \ -// -o dynamic-table-so.x86 -Wl,-f,aux_val +// -o dynamic-table-so.x86 -Wl,-f,aux.so -Wl,-F,filter.so // clang -target mipsel-linux-gnu -shared -fPIC -lc dynamic-table.c \ // -o dynamic-table-so.mips // clang -target mipsel-linux-gnu -lc dynamic-table.c \ // -o dynamic-table-exe.mips -// clang -target aarch64-linux-gnu -fPIC -shared dynamic-table.c \ +// clang -target aarch64-linux-gnu -fPIC -shared dynamic-table.c\ // -o dynamic-table-so.aarch64 int puts(const char *); diff --git a/test/tools/llvm-readobj/dynamic.test b/test/tools/llvm-readobj/dynamic.test index 5079d3556957..71b6b06cbc08 100644 --- a/test/tools/llvm-readobj/dynamic.test +++ b/test/tools/llvm-readobj/dynamic.test @@ -8,7 +8,7 @@ ELF-MIPS: AddressSize: 32bit ELF-MIPS: LoadName: ELF-MIPS: DynamicSection [ (23 entries) ELF-MIPS: Tag Type Name/Value -ELF-MIPS: 0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-MIPS: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-MIPS: 0x0000000C INIT 0x528 ELF-MIPS: 0x0000000D FINI 0x860 ELF-MIPS: 0x00000004 HASH 0x210 @@ -43,7 +43,7 @@ ELF-MIPS-EXE: AddressSize: 32bit ELF-MIPS-EXE: LoadName: ELF-MIPS-EXE: DynamicSection [ (26 entries) ELF-MIPS-EXE: Tag Type Name/Value -ELF-MIPS-EXE: 0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-MIPS-EXE: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-MIPS-EXE: 0x0000000C INIT 0x400418 ELF-MIPS-EXE: 0x0000000D FINI 0x4007B0 ELF-MIPS-EXE: 0x00000004 HASH 0x4002B8 @@ -80,9 +80,9 @@ ELF-X86-EXE: AddressSize: 32bit ELF-X86-EXE: LoadName: ELF-X86-EXE: DynamicSection [ (30 entries) ELF-X86-EXE: Tag Type Name/Value -ELF-X86-EXE: 0x00000001 NEEDED SharedLibrary (libstdc++.so.6) -ELF-X86-EXE: 0x00000001 NEEDED SharedLibrary (libgcc_s.so.1) -ELF-X86-EXE: 
0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libstdc++.so.6] +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libgcc_s.so.1] +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-X86-EXE: 0x0000000C INIT 0x62C ELF-X86-EXE: 0x0000000D FINI 0x920 ELF-X86-EXE: 0x00000019 INIT_ARRAY 0x19FC @@ -119,32 +119,33 @@ ELF-X86-SO: Format: ELF64-x86-64 ELF-X86-SO: Arch: x86_64 ELF-X86-SO: AddressSize: 64bit ELF-X86-SO: LoadName: -ELF-X86-SO: DynamicSection [ (26 entries) +ELF-X86-SO: DynamicSection [ ({{[0-9]+}} entries) ELF-X86-SO: Tag Type Name/Value -ELF-X86-SO: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) -ELF-X86-SO: 0x0000000000000001 NEEDED SharedLibrary (ld-linux-x86-64.so.2) -ELF-X86-SO: 0x000000007FFFFFFD AUXILIARY Auxiliary library: [aux_val] -ELF-X86-SO: 0x000000000000000C INIT 0x610 -ELF-X86-SO: 0x000000000000000D FINI 0x7AC -ELF-X86-SO: 0x0000000000000019 INIT_ARRAY 0x200DD0 +ELF-X86-SO: 0x0000000000000001 NEEDED Shared library: [libc.so.6] +ELF-X86-SO: 0x0000000000000001 NEEDED Shared library: [ld-linux-x86-64.so.2] +ELF-X86-SO: 0x000000007FFFFFFF FILTER Filter library: [filter.so] +ELF-X86-SO: 0x000000007FFFFFFD AUXILIARY Auxiliary library: [aux.so] +ELF-X86-SO: 0x000000000000000C INIT 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x000000000000000D FINI 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x0000000000000019 INIT_ARRAY 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000000000001B INIT_ARRAYSZ 8 (bytes) -ELF-X86-SO: 0x000000000000001A FINI_ARRAY 0x200DD8 +ELF-X86-SO: 0x000000000000001A FINI_ARRAY 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000000000001C FINI_ARRAYSZ 8 (bytes) ELF-X86-SO: 0x000000006FFFFEF5 GNU_HASH 0x1C8 ELF-X86-SO: 0x0000000000000005 STRTAB 0x3A0 ELF-X86-SO: 0x0000000000000006 SYMTAB 0x208 -ELF-X86-SO: 0x000000000000000A STRSZ 231 (bytes) +ELF-X86-SO: 0x000000000000000A STRSZ {{[0-9]+}} (bytes) ELF-X86-SO: 0x000000000000000B SYMENT 24 (bytes) ELF-X86-SO: 0x0000000000000003 PLTGOT 0x201000 ELF-X86-SO: 0x0000000000000002 PLTRELSZ 48 
(bytes) ELF-X86-SO: 0x0000000000000014 PLTREL RELA -ELF-X86-SO: 0x0000000000000017 JMPREL 0x5E0 -ELF-X86-SO: 0x0000000000000007 RELA 0x4F0 +ELF-X86-SO: 0x0000000000000017 JMPREL 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x0000000000000007 RELA 0x{{[0-9A-F]+}} ELF-X86-SO: 0x0000000000000008 RELASZ 240 (bytes) ELF-X86-SO: 0x0000000000000009 RELAENT 24 (bytes) -ELF-X86-SO: 0x000000006FFFFFFE VERNEED 0x4B0 +ELF-X86-SO: 0x000000006FFFFFFE VERNEED 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000006FFFFFFF VERNEEDNUM 2 -ELF-X86-SO: 0x000000006FFFFFF0 VERSYM 0x488 +ELF-X86-SO: 0x000000006FFFFFF0 VERSYM 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000006FFFFFF9 RELACOUNT 3 ELF-X86-SO: 0x0000000000000000 NULL 0x0 @@ -157,7 +158,7 @@ ELF-AARCH64-SO: AddressSize: 64bit ELF-AARCH64-SO: LoadName: ELF-AARCH64-SO: DynamicSection [ (26 entries) ELF-AARCH64-SO: Tag Type Name/Value -ELF-AARCH64-SO: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) +ELF-AARCH64-SO: 0x0000000000000001 NEEDED Shared library: [libc.so.6] ELF-AARCH64-SO: 0x000000000000000C INIT 0x660 ELF-AARCH64-SO: 0x000000000000000D FINI 0x83C ELF-AARCH64-SO: 0x0000000000000019 INIT_ARRAY 0x10DB8 diff --git a/test/tools/llvm-readobj/gnu-sections.test b/test/tools/llvm-readobj/gnu-sections.test index fb90ce44d10f..f5b504fa66fa 100644 --- a/test/tools/llvm-readobj/gnu-sections.test +++ b/test/tools/llvm-readobj/gnu-sections.test @@ -1,6 +1,14 @@ RUN: llvm-readobj -s %p/Inputs/relocs.obj.elf-i386 --elf-output-style=GNU \ RUN: | FileCheck %s -check-prefix ELF32 -RUN: llvm-readobj -s %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: llvm-readobj -S %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readobj --wide --sections \ +RUN: %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readobj -W --sections \ +RUN: %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readelf -W -S 
%p/Inputs/relocs.obj.elf-x86_64 \ RUN: | FileCheck %s -check-prefix ELF64 ELF32: Section Headers: diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt index 3bb0c8f7b7c5..731bcbd8ac9d 100644 --- a/tools/llvm-ar/CMakeLists.txt +++ b/tools/llvm-ar/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} Core + DlltoolDriver LibDriver Object Support @@ -15,3 +16,4 @@ add_llvm_tool(llvm-ar add_llvm_tool_symlink(llvm-ranlib llvm-ar) add_llvm_tool_symlink(llvm-lib llvm-ar) +add_llvm_tool_symlink(llvm-dlltool llvm-ar) diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp index 500507fd4966..af4d3efa52f7 100644 --- a/tools/llvm-ar/llvm-ar.cpp +++ b/tools/llvm-ar/llvm-ar.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" #include "llvm/ToolDrivers/llvm-lib/LibDriver.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" @@ -863,6 +864,9 @@ int main(int argc, char **argv) { llvm::InitializeAllAsmParsers(); StringRef Stem = sys::path::stem(ToolName); + if (Stem.find("dlltool") != StringRef::npos) + return dlltoolDriverMain(makeArrayRef(argv, argc)); + if (Stem.find("ranlib") == StringRef::npos && Stem.find("lib") != StringRef::npos) return libDriverMain(makeArrayRef(argv, argc)); @@ -878,5 +882,5 @@ int main(int argc, char **argv) { return ranlib_main(); if (Stem.find("ar") != StringRef::npos) return ar_main(); - fail("Not ranlib, ar or lib!"); + fail("Not ranlib, ar, lib or dlltool!"); } diff --git a/tools/llvm-mt/CMakeLists.txt b/tools/llvm-mt/CMakeLists.txt new file mode 100644 index 000000000000..d97cc4fe446f --- /dev/null +++ b/tools/llvm-mt/CMakeLists.txt @@ -0,0 +1,13 @@ +set(LLVM_LINK_COMPONENTS + Option + Support + ) + +set(LLVM_TARGET_DEFINITIONS Opts.td) + +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(MtTableGen) + +add_llvm_tool(llvm-mt + 
llvm-mt.cpp + ) diff --git a/tools/llvm-mt/LLVMBuild.txt b/tools/llvm-mt/LLVMBuild.txt new file mode 100644 index 000000000000..894d3527924b --- /dev/null +++ b/tools/llvm-mt/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/llvm-mt/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-mt +parent = Tools +required_libraries = Option Support diff --git a/tools/llvm-mt/Opts.td b/tools/llvm-mt/Opts.td new file mode 100644 index 000000000000..6dc3eea524e6 --- /dev/null +++ b/tools/llvm-mt/Opts.td @@ -0,0 +1,29 @@ +include "llvm/Option/OptParser.td" + +def unsupported : OptionGroup<"unsupported">; +def manifest : Separate<["/", "-"], "manifest">, HelpText<"Used to specify each manifest that need to be processed">, MetaVarName<"manifest">; +def identity : Joined<["/", "-"], "identity:">, HelpText<"Not supported">, MetaVarName<"identity">, Group; +def rgs : Joined<["/", "-"], "rgs:">, HelpText<"Not supported">, MetaVarName<"script">, Group; +def tlb : Joined<["/", "-"], "tlb:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def dll : Joined<["/", "-"], "dll:">, HelpText<"Not supported">, MetaVarName<"dll">, Group; +def replacements : Joined<["/", "-"], "replacements:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def managed_assembly_name : Joined<["/", "-"], "managedassemblyname:">, HelpText<"Not supported">, MetaVarName<"assembly">, Group; +def no_dependency : Flag<["/", "-"], "nodependency">, 
HelpText<"Not supported">, Group; +def category : Flag<["/", "-"], "category">, HelpText<"Not supported">, Group; +def no_logo : Flag<["/", "-"], "nologo">, HelpText<"No effect as this tool never writes copyright data. Included for parity">; +def out : Joined<["/", "-"], "out:">, HelpText<"Name of the output manifest. If this is skipped and only one manifest is being operated upon by the tool, that manifest is modified in place">, MetaVarName<"manifest">; +def input_resource : Joined<["/", "-"], "inputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def output_resource : Joined<["/", "-"], "outputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def output_resource_flag : Flag<["/", "-"], "outputresource">, Alias, HelpText<"Not supported">, Group; +def update_resource : Joined<["/", "-"], "updateresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def hash_update : Joined<["/", "-"], "hashupdate:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def hash_update_flag : Flag<["/", "-"], "hashupdate">, Alias, HelpText<"Not supported">, Group; +def validate_manifest : Flag<["/", "-"], "validate_manifest">, HelpText<"Not supported">, Group; +def validate_file_hashes : Joined<["/", "-"], "validate_file_hashes:">, HelpText<"Not supported">, MetaVarName<"">, Group; +def canonicalize : Flag<["/", "-"], "canonicalize:">, HelpText<"Not supported">, Group; +def check_for_duplicates : Flag<["/", "-"], "check_for_duplicates:">, HelpText<"Not supported">, Group; +def make_cdfs : Flag<["/", "-"], "makecdfs:">, HelpText<"Not supported">, Group; +def verbose : Flag<["/", "-"], "verbose">, HelpText<"Not supported">, Group; +def help : Flag<["/", "-"], "?">; +def help_long : Flag<["/", "-"], "help">, Alias; +def h : Flag<["/", "-"], "h">, Alias; diff --git a/tools/llvm-mt/llvm-mt.cpp b/tools/llvm-mt/llvm-mt.cpp new file mode 100644 index 000000000000..05c9238c7c64 --- /dev/null +++ b/tools/llvm-mt/llvm-mt.cpp @@ -0,0 
+1,117 @@ +//===- llvm-mt.cpp - Merge .manifest files ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// Merge .manifest files. This is intended to be a platform-independent port +// of Microsoft's mt.exe. +// +//===---------------------------------------------------------------------===// + +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/raw_ostream.h" + +#include + +using namespace llvm; + +namespace { + +enum ID { + OPT_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Opts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Opts.inc" +#undef PREFIX + +static const opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ +{ \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Opts.inc" +#undef OPTION +}; + +class CvtResOptTable : public opt::OptTable { +public: + CvtResOptTable() : OptTable(InfoTable, true) {} +}; + +static ExitOnError ExitOnErr; +} // namespace + +LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) { + errs() << "llvm-mt error: " << Msg << "\n"; + exit(1); +} + +int main(int argc, const char **argv) { + sys::PrintStackTraceOnErrorSignal(argv[0]); + PrettyStackTraceProgram X(argc, argv); + + ExitOnErr.setBanner("llvm-mt: "); + + SmallVector argv_buf; + SpecificBumpPtrAllocator ArgAllocator; + ExitOnErr(errorCodeToError(sys::Process::GetArgumentVector( + argv_buf, makeArrayRef(argv, argc), ArgAllocator))); + + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
+ + CvtResOptTable T; + unsigned MAI, MAC; + ArrayRef ArgsArr = makeArrayRef(argv + 1, argc); + opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC); + + for (auto &Arg : InputArgs) { + if (Arg->getOption().matches(OPT_unsupported)) { + outs() << "llvm-mt: ignoring unsupported '" << Arg->getOption().getName() + << "' option\n"; + } + } + + if (InputArgs.hasArg(OPT_help)) { + T.PrintHelp(outs(), "mt", "Manifest Tool", false); + return 0; + } + + std::vector InputFiles = InputArgs.getAllArgValues(OPT_manifest); + + if (InputFiles.size() == 0) { + reportError("no input file specified"); + } + + StringRef OutputFile; + + if (InputArgs.hasArg(OPT_out)) { + OutputFile = InputArgs.getLastArgValue(OPT_out); + } else if (InputFiles.size() == 1) { + OutputFile = InputFiles[0]; + } else { + reportError("no output file specified"); + } + + return 0; +} diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 812f1af3ac68..d54b45515f05 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -870,7 +870,10 @@ static void printRelocationTargetName(const MachOObjectFile *O, bool isExtern = O->getPlainRelocationExternal(RE); uint64_t Val = O->getPlainRelocationSymbolNum(RE); - if (isExtern) { + if (O->getAnyRelocationType(RE) == MachO::ARM64_RELOC_ADDEND) { + fmt << format("0x%x", Val); + return; + } else if (isExtern) { symbol_iterator SI = O->symbol_begin(); advance(SI, Val); Expected SOrErr = SI->getName(); diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp index 0642d841fd9f..01c7481c3086 100644 --- a/tools/llvm-pdbutil/DumpOutputStyle.cpp +++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp @@ -654,7 +654,7 @@ static void dumpFullTypeStream(LinePrinter &Printer, NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords()); MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types, - Stream.getHashValues()); + Stream.getNumHashBuckets(), 
Stream.getHashValues()); if (auto EC = codeview::visitTypeStream(Types, V)) { Printer.formatLine("An error occurred dumping type records: {0}", @@ -670,7 +670,7 @@ static void dumpPartialTypeStream(LinePrinter &Printer, NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords()); MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types, - Stream.getHashValues()); + Stream.getNumHashBuckets(), Stream.getHashValues()); if (opts::dump::DumpTypeDependents) { // If we need to dump all dependents, then iterate each index and find diff --git a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp index ab7045ca4492..d93843649db0 100644 --- a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp +++ b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp @@ -725,8 +725,9 @@ Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) { Proc.Parent, Proc.End, formatSegmentOffset(Proc.Segment, Proc.CodeOffset), Proc.CodeSize); - P.formatLine("debug start = {0}, debug end = {1}, flags = {2}", Proc.DbgStart, - Proc.DbgEnd, + // FIXME: It seems FunctionType is sometimes an id and sometimes a type. 
+ P.formatLine("type = `{0}`, debug start = {1}, debug end = {2}, flags = {3}", + typeIndex(Proc.FunctionType), Proc.DbgStart, Proc.DbgEnd, formatProcSymFlags(P.getIndentLevel() + 9, Proc.Flags)); return Error::success(); } diff --git a/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/tools/llvm-pdbutil/MinimalTypeDumper.cpp index 9621320ea99a..0079b9e7eaa4 100644 --- a/tools/llvm-pdbutil/MinimalTypeDumper.cpp +++ b/tools/llvm-pdbutil/MinimalTypeDumper.cpp @@ -18,6 +18,7 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/PDB/Native/TpiHashing.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" @@ -214,10 +215,20 @@ Error MinimalTypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { getLeafTypeName(Record.Type), Record.length()); } else { std::string H; - if (Index.toArrayIndex() >= HashValues.size()) + if (Index.toArrayIndex() >= HashValues.size()) { H = "(not present)"; - else - H = utostr(HashValues[Index.toArrayIndex()]); + } else { + uint32_t Hash = HashValues[Index.toArrayIndex()]; + Expected MaybeHash = hashTypeRecord(Record); + if (!MaybeHash) + return MaybeHash.takeError(); + uint32_t OurHash = *MaybeHash; + OurHash %= NumHashBuckets; + if (Hash == OurHash) + H = "0x" + utohexstr(Hash); + else + H = "0x" + utohexstr(Hash) + ", our hash = 0x" + utohexstr(OurHash); + } P.formatLine("{0} | {1} [size = {2}, hash = {3}]", fmt_align(Index, AlignStyle::Right, Width), getLeafTypeName(Record.Type), Record.length(), H); @@ -395,8 +406,7 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR, Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { - P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age, - fmt_guid(TS.Guid)); + P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age, TS.Guid); return Error::success(); } diff --git 
a/tools/llvm-pdbutil/MinimalTypeDumper.h b/tools/llvm-pdbutil/MinimalTypeDumper.h index 42882b4b4060..4227688f0f71 100644 --- a/tools/llvm-pdbutil/MinimalTypeDumper.h +++ b/tools/llvm-pdbutil/MinimalTypeDumper.h @@ -25,9 +25,10 @@ class MinimalTypeDumpVisitor : public codeview::TypeVisitorCallbacks { public: MinimalTypeDumpVisitor(LinePrinter &P, uint32_t Width, bool RecordBytes, bool Hashes, codeview::LazyRandomTypeCollection &Types, + uint32_t NumHashBuckets, FixedStreamArray HashValues) : P(P), Width(Width), RecordBytes(RecordBytes), Hashes(Hashes), - Types(Types), HashValues(HashValues) {} + Types(Types), NumHashBuckets(NumHashBuckets), HashValues(HashValues) {} Error visitTypeBegin(codeview::CVType &Record, codeview::TypeIndex Index) override; @@ -53,6 +54,7 @@ private: bool RecordBytes = false; bool Hashes = false; codeview::LazyRandomTypeCollection &Types; + uint32_t NumHashBuckets; FixedStreamArray HashValues; }; } // namespace pdb diff --git a/tools/llvm-pdbutil/PdbYaml.cpp b/tools/llvm-pdbutil/PdbYaml.cpp index 315ae2e6711f..9c3beb566d2c 100644 --- a/tools/llvm-pdbutil/PdbYaml.cpp +++ b/tools/llvm-pdbutil/PdbYaml.cpp @@ -38,41 +38,6 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::pdb::PdbRaw_FeatureSig) namespace llvm { namespace yaml { -template <> struct ScalarTraits { - static void output(const llvm::pdb::PDB_UniqueId &S, void *, - llvm::raw_ostream &OS) { - OS << S; - } - - static StringRef input(StringRef Scalar, void *Ctx, - llvm::pdb::PDB_UniqueId &S) { - if (Scalar.size() != 38) - return "GUID strings are 38 characters long"; - if (Scalar[0] != '{' || Scalar[37] != '}') - return "GUID is not enclosed in {}"; - if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' || - Scalar[24] != '-') - return "GUID sections are not properly delineated with dashes"; - - uint8_t *OutBuffer = S.Guid; - for (auto Iter = Scalar.begin(); Iter != Scalar.end();) { - if (*Iter == '-' || *Iter == '{' || *Iter == '}') { - ++Iter; - continue; - } - uint8_t Value = 
(llvm::hexDigitValue(*Iter) << 4); - ++Iter; - Value |= llvm::hexDigitValue(*Iter); - ++Iter; - *OutBuffer++ = Value; - } - - return ""; - } - - static bool mustQuote(StringRef Scalar) { return needsQuotes(Scalar); } -}; - template <> struct ScalarEnumerationTraits { static void enumeration(IO &io, llvm::pdb::PDB_Machine &Value) { io.enumCase(Value, "Invalid", PDB_Machine::Invalid); diff --git a/tools/llvm-pdbutil/PdbYaml.h b/tools/llvm-pdbutil/PdbYaml.h index 62ed608916fc..91e054490a5f 100644 --- a/tools/llvm-pdbutil/PdbYaml.h +++ b/tools/llvm-pdbutil/PdbYaml.h @@ -57,7 +57,7 @@ struct PdbInfoStream { PdbRaw_ImplVer Version = PdbImplVC70; uint32_t Signature = 0; uint32_t Age = 1; - PDB_UniqueId Guid; + codeview::GUID Guid; std::vector Features; std::vector NamedStreams; }; diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp index 6aa08ff3cd87..f2bd194622ed 100644 --- a/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -956,8 +956,8 @@ static void mergePdbs() { SmallVector IdMap; if (File.hasPDBTpiStream()) { auto &Tpi = ExitOnErr(File.getPDBTpiStream()); - ExitOnErr(codeview::mergeTypeRecords(MergedTpi, TypeMap, nullptr, - Tpi.typeArray())); + ExitOnErr( + codeview::mergeTypeRecords(MergedTpi, TypeMap, Tpi.typeArray())); } if (File.hasPDBIpiStream()) { auto &Ipi = ExitOnErr(File.getPDBIpiStream()); diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt index bde486a5f0db..f5b1a5b256ad 100644 --- a/tools/llvm-readobj/CMakeLists.txt +++ b/tools/llvm-readobj/CMakeLists.txt @@ -20,3 +20,5 @@ add_llvm_tool(llvm-readobj WasmDumper.cpp Win64EHDumper.cpp ) + +add_llvm_tool_symlink(llvm-readelf llvm-readobj) diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp index 9fb3267e2f9d..74c44116b127 100644 --- a/tools/llvm-readobj/COFFDumper.cpp +++ b/tools/llvm-readobj/COFFDumper.cpp @@ -1215,8 +1215,7 @@ void COFFDumper::mergeCodeViewTypes(TypeTableBuilder 
&CVIDs, error(object_error::parse_failed); } SmallVector SourceToDest; - if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, nullptr, - Types)) + if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types)) return error(std::move(EC)); } } diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index a1db96cba081..5698420bbcc2 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -1532,6 +1532,7 @@ static const char *getTypeString(unsigned Arch, uint64_t Type) { LLVM_READOBJ_TYPE_CASE(TLSDESC_PLT); LLVM_READOBJ_TYPE_CASE(TLSDESC_GOT); LLVM_READOBJ_TYPE_CASE(AUXILIARY); + LLVM_READOBJ_TYPE_CASE(FILTER); default: return "unknown"; } } @@ -1624,6 +1625,10 @@ StringRef ELFDumper::getDynamicString(uint64_t Value) const { return StringRef(DynamicStringTable.data() + Value); } +static void printLibrary(raw_ostream &OS, const Twine &Tag, const Twine &Name) { + OS << Tag << ": [" << Name << "]"; +} + template void ELFDumper::printValue(uint64_t Type, uint64_t Value) { raw_ostream &OS = W.getOStream(); @@ -1687,13 +1692,16 @@ void ELFDumper::printValue(uint64_t Type, uint64_t Value) { OS << Value << " (bytes)"; break; case DT_NEEDED: - OS << "SharedLibrary (" << getDynamicString(Value) << ")"; + printLibrary(OS, "Shared library", getDynamicString(Value)); break; case DT_SONAME: - OS << "LibrarySoname (" << getDynamicString(Value) << ")"; + printLibrary(OS, "Library soname", getDynamicString(Value)); break; case DT_AUXILIARY: - OS << "Auxiliary library: [" << getDynamicString(Value) << "]"; + printLibrary(OS, "Auxiliary library", getDynamicString(Value)); + break; + case DT_FILTER: + printLibrary(OS, "Filter library", getDynamicString(Value)); break; case DT_RPATH: case DT_RUNPATH: diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 51991a3f067b..7bfb18fab12b 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ 
-34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Signals.h" @@ -50,6 +51,13 @@ namespace opts { cl::desc(""), cl::ZeroOrMore); + // -wide, -W + cl::opt WideOutput("wide", + cl::desc("Ignored for compatibility with GNU readelf")); + cl::alias WideOutputShort("W", + cl::desc("Alias for --wide"), + cl::aliasopt(WideOutput)); + // -file-headers, -h cl::opt FileHeaders("file-headers", cl::desc("Display file headers ")); @@ -57,12 +65,16 @@ namespace opts { cl::desc("Alias for --file-headers"), cl::aliasopt(FileHeaders)); - // -sections, -s + // -sections, -s, -S + // Note: In GNU readelf, -s means --symbols! cl::opt Sections("sections", cl::desc("Display all sections.")); cl::alias SectionsShort("s", cl::desc("Alias for --sections"), cl::aliasopt(Sections)); + cl::alias SectionsShortUpper("S", + cl::desc("Alias for --sections"), + cl::aliasopt(Sections)); // -section-relocations, -sr cl::opt SectionRelocations("section-relocations", @@ -533,13 +545,19 @@ static void dumpInput(StringRef File) { } int main(int argc, const char *argv[]) { - sys::PrintStackTraceOnErrorSignal(argv[0]); + StringRef ToolName = argv[0]; + sys::PrintStackTraceOnErrorSignal(ToolName); PrettyStackTraceProgram X(argc, argv); llvm_shutdown_obj Y; // Register the target printer for --version. cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + opts::WideOutput.setHiddenFlag(cl::Hidden); + + if (sys::path::stem(ToolName).find("readelf") != StringRef::npos) + opts::Output = opts::GNU; + cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n"); // Default to stdin if no filename is specified. 
diff --git a/tools/opt-viewer/opt-diff.py b/tools/opt-viewer/opt-diff.py index 9e921f8488d3..f39432c8bc9d 100755 --- a/tools/opt-viewer/opt-diff.py +++ b/tools/opt-viewer/opt-diff.py @@ -20,24 +20,17 @@ import optrecord import argparse from collections import defaultdict from multiprocessing import cpu_count, Pool -import os, os.path -import fnmatch - -def find_files(dir_or_file): - if os.path.isfile(dir_or_file): - return [dir_or_file] - - all = [] - for dir, subdirs, files in os.walk(dir_or_file): - for file in files: - if fnmatch.fnmatch(file, "*.opt.yaml"): - all.append( os.path.join(dir, file)) - return all if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_dir_or_file_1') - parser.add_argument('yaml_dir_or_file_2') + parser.add_argument( + 'yaml_dir_or_file_1', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the old version for the comparison') + parser.add_argument( + 'yaml_dir_or_file_2', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the new version for the comparison') parser.add_argument( '--jobs', '-j', @@ -53,8 +46,8 @@ if __name__ == '__main__': parser.add_argument('--output', '-o', default='diff.opt.yaml') args = parser.parse_args() - files1 = find_files(args.yaml_dir_or_file_1) - files2 = find_files(args.yaml_dir_or_file_2) + files1 = optrecord.find_opt_files([args.yaml_dir_or_file_1]) + files2 = optrecord.find_opt_files([args.yaml_dir_or_file_2]) print_progress = not args.no_progress_indicator all_remarks1, _, _ = optrecord.gather_results(files1, args.jobs, print_progress) diff --git a/tools/opt-viewer/opt-stats.py b/tools/opt-viewer/opt-stats.py index a7e598fdfd02..205b08ba8a74 100755 --- a/tools/opt-viewer/opt-stats.py +++ b/tools/opt-viewer/opt-stats.py @@ -15,7 +15,11 @@ from multiprocessing import cpu_count, Pool if __name__ == '__main__': parser = 
argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--jobs', '-j', @@ -31,8 +35,14 @@ if __name__ == '__main__': args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, _ = optrecord.gather_results( - args.yaml_files, args.jobs, print_progress) + files, args.jobs, print_progress) if print_progress: print('\n') diff --git a/tools/opt-viewer/opt-viewer.py b/tools/opt-viewer/opt-viewer.py index e6dd6a0286fe..69bcaedb7669 100755 --- a/tools/opt-viewer/opt-viewer.py +++ b/tools/opt-viewer/opt-viewer.py @@ -219,7 +219,11 @@ def generate_report(all_remarks, if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--output-dir', '-o', @@ -248,8 +252,14 @@ if __name__ == '__main__': args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, should_display_hotness = \ - optrecord.gather_results(args.yaml_files, args.jobs, print_progress) + optrecord.gather_results(files, args.jobs, print_progress) map_remarks(all_remarks) diff --git a/tools/opt-viewer/optpmap.py b/tools/opt-viewer/optpmap.py index 01e848e03976..16cb22e21491 100644 --- a/tools/opt-viewer/optpmap.py +++ b/tools/opt-viewer/optpmap.py @@ -20,6 +20,7 @@ def _wrapped_func(func_and_args): 
with _current.get_lock(): _current.value += 1 sys.stdout.write('\r\t{} of {}'.format(_current.value, _total.value)) + sys.stdout.flush() return func(argument) diff --git a/tools/opt-viewer/optrecord.py b/tools/opt-viewer/optrecord.py index 61ed9626cffa..4599e12d7e68 100644 --- a/tools/opt-viewer/optrecord.py +++ b/tools/opt-viewer/optrecord.py @@ -12,8 +12,10 @@ except ImportError: import cgi from collections import defaultdict +import fnmatch import functools from multiprocessing import Lock +import os, os.path import subprocess import optpmap @@ -47,7 +49,7 @@ def demangle(name): def html_file_name(filename): - return filename.replace('/', '_') + ".html" + return filename.replace('/', '_').replace('#', '_') + ".html" def make_link(File, Line): @@ -233,3 +235,19 @@ def gather_results(filenames, num_jobs, should_print_progress): all_remarks.update(all_remarks_job) return all_remarks, file_remarks, max_hotness != 0 + + +def find_opt_files(dirs_or_files): + all = [] + for dir_or_file in dirs_or_files: + if os.path.isfile(dir_or_file): + all.append(dir_or_file) + else: + for dir, subdirs, files in os.walk(dir_or_file): + # Exclude mounted directories and symlinks (os.walk default). 
+ subdirs[:] = [d for d in subdirs + if not os.path.ismount(os.path.join(dir, d))] + for file in files: + if fnmatch.fnmatch(file, "*.opt.yaml"): + all.append(os.path.join(dir, file)) + return all diff --git a/unittests/Analysis/CGSCCPassManagerTest.cpp b/unittests/Analysis/CGSCCPassManagerTest.cpp index d46d9535fa4b..e24818265975 100644 --- a/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -9,6 +9,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" @@ -227,6 +228,7 @@ public: "entry:\n" " ret void\n" "}\n")) { + MAM.registerPass([&] { return TargetLibraryAnalysis(); }); MAM.registerPass([&] { return LazyCallGraphAnalysis(); }); MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); diff --git a/unittests/Analysis/LazyCallGraphTest.cpp b/unittests/Analysis/LazyCallGraphTest.cpp index 65730486cd75..9e7e128bcfb3 100644 --- a/unittests/Analysis/LazyCallGraphTest.cpp +++ b/unittests/Analysis/LazyCallGraphTest.cpp @@ -216,10 +216,17 @@ static const char DiamondOfTrianglesRefGraph[] = " ret void\n" "}\n"; +static LazyCallGraph buildCG(Module &M) { + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + LazyCallGraph CG(M, TLI); + return CG; +} + TEST(LazyCallGraphTest, BasicGraphFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // The order of the entry nodes should be stable w.r.t. 
the source order of // the IR, and everything in our module is an entry node, so just directly @@ -407,7 +414,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); LazyCallGraph::Node &A = CG.get(lookupFunction(*M, "a")); LazyCallGraph::Node &B = CG.get(lookupFunction(*M, "b")); @@ -445,7 +452,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { TEST(LazyCallGraphTest, InnerSCCFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Now mutate the graph to connect every node into a single RefSCC to ensure // that our inner SCC formation handles the rest. @@ -542,7 +549,7 @@ TEST(LazyCallGraphTest, MultiArmSCC) { " call void @f1()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -593,7 +600,7 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -739,7 +746,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -831,7 +838,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) { // references rather than calls. std::unique_ptr M = parseAssembly(Context, DiamondOfTrianglesRefGraph); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -938,7 +945,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeCallCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. 
CG.buildRefSCCs(); @@ -1015,7 +1022,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1077,7 +1084,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1221,7 +1228,7 @@ TEST(LazyCallGraphTest, InternalEdgeMutation) { " call void @a()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1315,7 +1322,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1390,7 +1397,7 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1467,7 +1474,7 @@ TEST(LazyCallGraphTest, InternalCallEdgeToRef) { " call void @c()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1560,7 +1567,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. 
CG.buildRefSCCs(); @@ -1672,7 +1679,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1802,7 +1809,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1885,7 +1892,7 @@ TEST(LazyCallGraphTest, HandleBlockAddress) { " store i8* blockaddress(@f, %bb), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); CG.buildRefSCCs(); auto I = CG.postorder_ref_scc_begin(); @@ -1933,7 +1940,7 @@ TEST(LazyCallGraphTest, ReplaceNodeFunction) { " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -2011,7 +2018,7 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpurriousRef) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Insert spurious ref edges. 
LazyCallGraph::Node &AN = CG.get(lookupFunction(*M, "a")); diff --git a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp index 92134513b75b..04b7bb0ba936 100644 --- a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp +++ b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp @@ -13,7 +13,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" #include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" @@ -402,4 +401,4 @@ TEST_F(RandomAccessVisitorTest, CrossChunkName) { StringRef Name = Types.getTypeName(IndexOne); EXPECT_EQ("const FooClass", Name); -} \ No newline at end of file +} diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt index 989cb396f674..583b065f464c 100644 --- a/unittests/DebugInfo/PDB/CMakeLists.txt +++ b/unittests/DebugInfo/PDB/CMakeLists.txt @@ -10,11 +10,10 @@ set(DebugInfoPDBSources StringTableBuilderTest.cpp MSFBuilderTest.cpp PDBApiTest.cpp - TypeServerHandlerTest.cpp ) add_llvm_unittest(DebugInfoPDBTests ${DebugInfoPDBSources} ) -target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) \ No newline at end of file +target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) diff --git a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp b/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp deleted file mode 100644 index d09b9130ee27..000000000000 --- a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp +++ /dev/null @@ -1,183 +0,0 @@ -//===- llvm/unittest/DebugInfo/PDB/TypeServerHandlerTest.cpp --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Error.h" -#include "llvm/Testing/Support/Error.h" - -#include "gtest/gtest.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -namespace { - -constexpr uint8_t Guid[] = {0x2a, 0x2c, 0x1c, 0x2a, 0xcb, 0x9e, 0x48, 0x18, - 0x82, 0x82, 0x7a, 0x87, 0xc3, 0xfe, 0x16, 0xe8}; -StringRef GuidStr(reinterpret_cast(Guid), - llvm::array_lengthof(Guid)); - -constexpr const char *Name = "Test Name"; -constexpr int Age = 1; - -class MockTypeServerHandler : public TypeServerHandler { -public: - explicit MockTypeServerHandler(bool HandleAlways) - : HandleAlways(HandleAlways) {} - - Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) override { - if (TS.Age != Age || TS.Guid != GuidStr || TS.Name != Name) - return make_error(cv_error_code::corrupt_record, - "Invalid TypeServer record!"); - - if (Handled && !HandleAlways) - return false; - - Handled = true; - return true; - } - - bool Handled = false; - bool HandleAlways; -}; - -class MockTypeVisitorCallbacks : public TypeVisitorCallbacks { -public: - enum class State { - Ready, - VisitTypeBegin, - VisitKnownRecord, - VisitTypeEnd, - }; - Error visitTypeBegin(CVType &CVT) override { - if (S != State::Ready) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeBegin; 
- return Error::success(); - } - - Error visitKnownRecord(CVType &CVT, TypeServer2Record &TS) override { - if (S != State::VisitTypeBegin) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitKnownRecord; - return Error::success(); - } - - Error visitTypeEnd(CVType &CVT) override { - if (S != State::VisitKnownRecord) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeEnd; - return Error::success(); - } - - State S = State::Ready; -}; - -class TypeServerHandlerTest : public testing::Test { -public: - void SetUp() override { - TypeServer2Record R(TypeRecordKind::TypeServer2); - R.Age = Age; - R.Guid = GuidStr; - R.Name = Name; - - TypeTableBuilder Builder(Allocator); - Builder.writeKnownType(R); - TypeServerRecord.RecordData = Builder.records().front(); - TypeServerRecord.Type = TypeLeafKind::LF_TYPESERVER2; - } - -protected: - BumpPtrAllocator Allocator; - CVType TypeServerRecord; -}; - -// Test that when no type server handler is registered, it gets handled by the -// normal -// visitor callbacks. -TEST_F(TypeServerHandlerTest, VisitRecordNoTypeServer) { - MockTypeVisitorCallbacks C2; - MockTypeVisitorCallbacks C1; - TypeVisitorCallbackPipeline Pipeline; - - Pipeline.addCallbackToPipeline(C1); - Pipeline.addCallbackToPipeline(C2); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, Pipeline), - Succeeded()); - - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C2.S); -} - -// Test that when a TypeServerHandler is registered, it gets consumed by the -// handler if and only if the handler returns true. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerOnce) { - MockTypeServerHandler Handler(false); - - MockTypeVisitorCallbacks C1; - - // Our mock server returns true the first time. 
- EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - // And false the second time. - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); -} - -// Test that when a type server handler is registered, if the handler keeps -// returning true, it will keep getting consumed by the handler and not go -// to the default processor. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerAlways) { - MockTypeServerHandler Handler(true); - - MockTypeVisitorCallbacks C1; - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); -} - -} // end anonymous namespace diff --git a/unittests/IR/CFGBuilder.cpp b/unittests/IR/CFGBuilder.cpp new file mode 100644 index 000000000000..50494ab5c7ca --- /dev/null +++ b/unittests/IR/CFGBuilder.cpp @@ -0,0 +1,269 @@ +//===- llvm/Testing/Support/CFGBuilder.cpp --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "CFGBuilder.h" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" + +#define DEBUG_TYPE "cfg-builder" + +using namespace llvm; + +CFGHolder::CFGHolder(StringRef ModuleName, StringRef FunctionName) + : Context(llvm::make_unique()), + M(llvm::make_unique(ModuleName, *Context)) { + FunctionType *FTy = TypeBuilder::get(*Context); + F = cast(M->getOrInsertFunction(FunctionName, FTy)); +} +CFGHolder::~CFGHolder() = default; + +CFGBuilder::CFGBuilder(Function *F, const std::vector &InitialArcs, + std::vector Updates) + : F(F), Updates(std::move(Updates)) { + assert(F); + buildCFG(InitialArcs); +} + +static void ConnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Creating BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + auto *IntTy = IntegerType::get(From->getContext(), 32); + + if (isa(From->getTerminator())) + From->getTerminator()->eraseFromParent(); + if (!From->getTerminator()) { + IRBuilder<> IRB(From); + IRB.CreateSwitch(ConstantInt::get(IntTy, 0), To); + return; + } + + SwitchInst *SI = cast(From->getTerminator()); + const auto Last = SI->getNumCases(); + + auto *IntVal = ConstantInt::get(IntTy, Last); + SI->addCase(IntVal, To); +} + +static void DisconnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Deleting BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + SwitchInst *SI = cast(From->getTerminator()); + + if (SI->getNumCases() == 0) { + SI->eraseFromParent(); + IRBuilder<> IRB(From); + IRB.CreateUnreachable(); + return; + } + + if (SI->getDefaultDest() == To) { + auto FirstC = SI->case_begin(); + SI->setDefaultDest(FirstC->getCaseSuccessor()); + SI->removeCase(FirstC); + return; + } + + for (auto CIt = 
SI->case_begin(); CIt != SI->case_end(); ++CIt) + if (CIt->getCaseSuccessor() == To) { + SI->removeCase(CIt); + return; + } +} + +BasicBlock *CFGBuilder::getOrAddBlock(StringRef BlockName) { + auto BIt = NameToBlock.find(BlockName); + if (BIt != NameToBlock.end()) + return BIt->second; + + auto *BB = BasicBlock::Create(F->getParent()->getContext(), BlockName, F); + IRBuilder<> IRB(BB); + IRB.CreateUnreachable(); + NameToBlock[BlockName] = BB; + return BB; +} + +bool CFGBuilder::connect(const Arc &A) { + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + if (Arcs.count(A) != 0) + return false; + + Arcs.insert(A); + ConnectBlocks(From, To); + return true; +} + +bool CFGBuilder::disconnect(const Arc &A) { + assert(NameToBlock.count(A.From) != 0 && "No block to disconnect (From)"); + assert(NameToBlock.count(A.To) != 0 && "No block to disconnect (To)"); + if (Arcs.count(A) == 0) + return false; + + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + Arcs.erase(A); + DisconnectBlocks(From, To); + return true; +} + +void CFGBuilder::buildCFG(const std::vector &NewArcs) { + for (const auto &A : NewArcs) { + const bool Connected = connect(A); + (void)Connected; + assert(Connected); + } +} + +Optional CFGBuilder::getNextUpdate() const { + if (UpdateIdx == Updates.size()) + return None; + return Updates[UpdateIdx]; +} + +Optional CFGBuilder::applyUpdate() { + if (UpdateIdx == Updates.size()) + return None; + Update NextUpdate = Updates[UpdateIdx++]; + if (NextUpdate.Action == ActionKind::Insert) + connect(NextUpdate.Edge); + else + disconnect(NextUpdate.Edge); + + return NextUpdate; +} + +void CFGBuilder::dump(raw_ostream &OS) const { + OS << "Arcs:\n"; + size_t i = 0; + for (const auto &A : Arcs) + OS << " " << i++ << ":\t" << A.From << " -> " << A.To << "\n"; + + OS << "Updates:\n"; + i = 0; + for (const auto &U : Updates) { + OS << (i + 1 == UpdateIdx ? 
"->" : " ") << i + << ((U.Action == ActionKind::Insert) ? "\tIns " : "\tDel ") + << U.Edge.From << " -> " << U.Edge.To << "\n"; + ++i; + } +} + +//---- CFGBuilder tests ---------------------------------------------------===// + +TEST(CFGBuilder, Construction) { + CFGHolder Holder; + std::vector Arcs = {{"entry", "a"}, {"a", "b"}, {"a", "c"}, + {"c", "d"}, {"d", "b"}, {"d", "e"}, + {"d", "f"}, {"e", "f"}}; + CFGBuilder B(Holder.F, Arcs, {}); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. +} + +TEST(CFGBuilder, Insertions) { + CFGHolder Holder; + const auto Insert = CFGBuilder::ActionKind::Insert; + std::vector Updates = { + {Insert, {"entry", "a"}}, {Insert, {"a", "b"}}, {Insert, {"a", "c"}}, + {Insert, {"c", "d"}}, {Insert, {"d", "b"}}, {Insert, {"d", "e"}}, + {Insert, {"d", "f"}}, {Insert, {"e", "f"}}}; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, {}, Updates); + + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + 
EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. +} + +TEST(CFGBuilder, Deletions) { + CFGHolder Holder; + std::vector Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto UpdateC = B.applyUpdate(); + + EXPECT_TRUE(UpdateC); + EXPECT_EQ(UpdateC->Action, CFGBuilder::ActionKind::Delete); + EXPECT_EQ(UpdateC->Edge.From, "c"); + EXPECT_EQ(UpdateC->Edge.To, "d"); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + + size_t i = 1; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); +} + +TEST(CFGBuilder, Rebuild) { + CFGHolder Holder; + std::vector Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Insert = CFGBuilder::ActionKind::Insert; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + {Insert, {"c", "d"}}, {Insert, {"a", "c"}}, {Insert, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); +} diff --git 
a/unittests/IR/CFGBuilder.h b/unittests/IR/CFGBuilder.h new file mode 100644 index 000000000000..d9d9c378e110 --- /dev/null +++ b/unittests/IR/CFGBuilder.h @@ -0,0 +1,94 @@ +//===- CFGBuilder.h - CFG building and updating utility ----------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// CFGBuilders provides utilities fo building and updating CFG for testing +/// purposes. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_CFG_BUILDER_H +#define LLVM_UNITTESTS_CFG_BUILDER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" + +#include +#include +#include +#include + +namespace llvm { + +class LLVMContext; +class Module; +class Function; +class BasicBlock; +class raw_ostream; + +struct CFGHolder { + std::unique_ptr Context; + std::unique_ptr M; + Function *F; + + CFGHolder(StringRef ModuleName = "m", StringRef FunctionName = "foo"); + ~CFGHolder(); // Defined in the .cpp file so we can use forward declarations. +}; + +/// \brief +/// CFGBuilder builds IR with specific CFG, based on the supplied list of arcs. +/// It's able to apply the provided updates and automatically modify the IR. +/// +/// Internally it makes every basic block end with either SwitchInst or with +/// UnreachableInst. When all arc to a BB are deleted, the BB remains in the +/// function and doesn't get deleted. 
+/// +class CFGBuilder { +public: + struct Arc { + StringRef From; + StringRef To; + + friend bool operator<(const Arc &LHS, const Arc &RHS) { + return std::tie(LHS.From, LHS.To) < + std::tie(RHS.From, RHS.To); + } + }; + + enum class ActionKind { Insert, Delete }; + struct Update { + ActionKind Action; + Arc Edge; + }; + + CFGBuilder(Function *F, const std::vector &InitialArcs, + std::vector Updates); + + BasicBlock *getOrAddBlock(StringRef BlockName); + Optional getNextUpdate() const; + Optional applyUpdate(); + void dump(raw_ostream &OS = dbgs()) const; + +private: + void buildCFG(const std::vector &Arcs); + bool connect(const Arc &A); + bool disconnect(const Arc &A); + + Function *F; + unsigned UpdateIdx = 0; + StringMap NameToBlock; + std::set Arcs; + std::vector Updates; +}; + +} // namespace llvm + +#endif diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index d76ebfa64d88..92c9f8d3788d 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -10,6 +10,7 @@ set(IRSources AsmWriterTest.cpp AttributesTest.cpp BasicBlockTest.cpp + CFGBuilder.cpp ConstantRangeTest.cpp ConstantsTest.cpp DebugInfoTest.cpp diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index fa3dad8a2ab1..df1e2993dc85 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include #include "llvm/Analysis/PostDominators.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Constants.h" @@ -15,22 +16,24 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" +#include "CFGBuilder.h" #include "gtest/gtest.h" using namespace llvm; +struct PostDomTree : PostDomTreeBase { + PostDomTree(Function &F) { recalculate(F); } +}; + /// Build the dominator tree for the function and run the Test. 
-static void -runWithDomTree(Module &M, StringRef FuncName, - function_ref *PDT)> - Test) { +static void runWithDomTree( + Module &M, StringRef FuncName, + function_ref Test) { auto *F = M.getFunction(FuncName); ASSERT_NE(F, nullptr) << "Could not find " << FuncName; // Compute the dominator tree for the function. DominatorTree DT(*F); - DominatorTreeBase PDT(/*isPostDom*/ true); - PDT.recalculate(*F); + PostDomTree PDT(*F); Test(*F, &DT, &PDT); } @@ -72,8 +75,7 @@ TEST(DominatorTree, Unreachable) { std::unique_ptr M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -293,8 +295,7 @@ TEST(DominatorTree, NonUniqueEdges) { std::unique_ptr M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -324,3 +325,280 @@ TEST(DominatorTree, NonUniqueEdges) { EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_b, BB2)); }); } + +namespace { +const auto Insert = CFGBuilder::ActionKind::Insert; +const auto Delete = CFGBuilder::ActionKind::Delete; + +bool CompUpdates(const CFGBuilder::Update &A, const CFGBuilder::Update &B) { + return std::tie(A.Action, A.Edge.From, A.Edge.To) < + std::tie(B.Action, B.Edge.From, B.Edge.To); +} +} // namespace + +TEST(DominatorTree, InsertReachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = {{Insert, {"12", "10"}}, + {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + 
EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertReachable2) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "5"}, {"2", "8"}, {"8", "11"}, {"11", "12"}, {"12", "10"}, + {"10", "9"}, {"9", "10"}}; + + std::vector Updates = {{Insert, {"10", "7"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate = B.applyUpdate(); + EXPECT_TRUE(LastUpdate); + + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); +} + +TEST(DominatorTree, InsertUnreachable) { + CFGHolder Holder; + std::vector Arcs = {{"1", "2"}, {"2", "3"}, {"3", "4"}, + {"5", "6"}, {"5", "7"}, {"3", "8"}, + {"9", "10"}, {"11", "12"}}; + + std::vector Updates = {{Insert, {"4", "5"}}, + {Insert, {"8", "9"}}, + {Insert, {"10", "12"}}, + {Insert, {"10", "11"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); 
+ PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertMixed) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector Updates = { + {Insert, {"4", "5"}}, {Insert, {"2", "5"}}, {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}, {Insert, {"12", "10"}}, {Insert, {"7", "8"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertPermut) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector Updates = {{Insert, {"4", "5"}}, + {Insert, {"2", "5"}}, + {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}}; + + while (std::next_permutation(Updates.begin(), Updates.end(), CompUpdates)) { + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } + } +} + +TEST(DominatorTree, DeleteReachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, 
{"2", "3"}, {"2", "4"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, + {"5", "7"}, {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector Updates = { + {Delete, {"2", "4"}}, {Delete, {"7", "8"}}, {Delete, {"10", "2"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, DeleteUnreachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector Updates = { + {Delete, {"8", "9"}}, {Delete, {"7", "8"}}, {Delete, {"3", "4"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDelete) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", 
"4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDeleteExhaustive) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + std::mt19937 Generator(0); + for (unsigned i = 0; i < 16; ++i) { + std::shuffle(Updates.begin(), Updates.end(), Generator); + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } + } +} diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 186330f10573..d361107cc0d2 100644 --- 
a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -463,13 +463,14 @@ TEST_F(IRBuilderTest, DebugLoc) { TEST_F(IRBuilderTest, DIImportedEntity) { IRBuilder<> Builder(BB); DIBuilder DIB(*M); + auto F = DIB.createFile("F.CBL", "/"); auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, - DIB.createFile("F.CBL", "/"), "llvm-cobol74", + F, "llvm-cobol74", true, "", 0); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); DIB.finalize(); EXPECT_TRUE(verifyModule(*M)); EXPECT_TRUE(CU->getImportedEntities().size() == 2); diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index cb38b30f43e6..e47afca532d2 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -2116,29 +2116,35 @@ TEST_F(DIImportedEntityTest, get) { unsigned Tag = dwarf::DW_TAG_imported_module; DIScope *Scope = getSubprogram(); DINode *Entity = getCompositeType(); + DIFile *File = getFile(); unsigned Line = 5; StringRef Name = "name"; - auto *N = DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name); + auto *N = + DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name); EXPECT_EQ(Tag, N->getTag()); EXPECT_EQ(Scope, N->getScope()); EXPECT_EQ(Entity, N->getEntity()); + EXPECT_EQ(File, N->getFile()); EXPECT_EQ(Line, N->getLine()); EXPECT_EQ(Name, N->getName()); - EXPECT_EQ(N, DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name)); + EXPECT_EQ( + N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, dwarf::DW_TAG_imported_declaration, - 
Scope, Entity, Line, Name)); + Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, getSubprogram(), Entity, - Line, Name)); + File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, getCompositeType(), - Line, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line + 1, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line, "other")); + File, Line, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, nullptr, Line, + Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, + Line + 1, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, + "other")); TempDIImportedEntity Temp = N->clone(); EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp))); diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp index b252641f1a13..b9b725f934b3 100644 --- a/unittests/Support/TargetParserTest.cpp +++ b/unittests/Support/TargetParserTest.cpp @@ -737,7 +737,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { unsigned Extensions = AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_SIMD | AArch64::AEK_FP16 | AArch64::AEK_PROFILE | - AArch64::AEK_RAS; + AArch64::AEK_RAS | AArch64::AEK_SVE; for (unsigned i = 0; i <= Extensions; i++) EXPECT_TRUE(i == 0 ? 
!AArch64::getExtensionFeatures(i, Features) @@ -762,7 +762,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"simd", "nosimd", "+neon", "-neon"}, {"fp16", "nofp16", "+fullfp16", "-fullfp16"}, {"profile", "noprofile", "+spe", "-spe"}, - {"ras", "noras", "+ras", "-ras"}}; + {"ras", "noras", "+ras", "-ras"}, + {"sve", "nosve", "+sve", "-sve"}}; for (unsigned i = 0; i < array_lengthof(ArchExt); i++) { EXPECT_EQ(StringRef(ArchExt[i][2]), diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 5cf0e9d0f5b3..120773a0c8dd 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -232,6 +232,22 @@ TEST(YAMLIO, TestSequenceMapWriteAndRead) { } } +// +// Test YAML filename handling. +// +static void testErrorFilename(const llvm::SMDiagnostic &Error, void *) { + EXPECT_EQ(Error.getFilename(), "foo.yaml"); +} + +TEST(YAMLIO, TestGivenFilename) { + auto Buffer = llvm::MemoryBuffer::getMemBuffer("{ x: 42 }", "foo.yaml"); + Input yin(*Buffer, nullptr, testErrorFilename); + FooBar Value; + yin >> Value; + + EXPECT_TRUE(!!yin.error()); +} + //===----------------------------------------------------------------------===// // Test built-in types diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp index a7a5ce8dd6d4..a75f446e4ba1 100644 --- a/unittests/Support/raw_ostream_test.cpp +++ b/unittests/Support/raw_ostream_test.cpp @@ -151,6 +151,11 @@ TEST(raw_ostreamTest, Justify) { EXPECT_EQ(" xyz", printToString(right_justify("xyz", 6), 6)); EXPECT_EQ("abc", printToString(right_justify("abc", 3), 3)); EXPECT_EQ("big", printToString(right_justify("big", 1), 3)); + EXPECT_EQ(" on ", printToString(center_justify("on", 9), 9)); + EXPECT_EQ(" off ", printToString(center_justify("off", 10), 10)); + EXPECT_EQ("single ", printToString(center_justify("single", 7), 7)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 4)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 
1)); } TEST(raw_ostreamTest, FormatHex) { diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index d4a21a986c58..6399fb5ec1dd 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -1268,12 +1268,12 @@ void CodeGenRegBank::computeSubRegLaneMasks() { CoveringLanes = LaneBitmask::getAll(); for (auto &Idx : SubRegIndices) { if (Idx.getComposites().empty()) { - if (Bit > 32) { + if (Bit > LaneBitmask::BitWidth) { PrintFatalError( Twine("Ran out of lanemask bits to represent subregister ") + Idx.getName()); } - Idx.LaneMask = LaneBitmask(1 << Bit); + Idx.LaneMask = LaneBitmask::getLane(Bit); ++Bit; } else { Idx.LaneMask = LaneBitmask::getNone(); @@ -1298,9 +1298,9 @@ void CodeGenRegBank::computeSubRegLaneMasks() { static_assert(sizeof(Idx.LaneMask.getAsInteger()) == 4, "Change Log2_32 to a proper one"); unsigned DstBit = Log2_32(Idx.LaneMask.getAsInteger()); - assert(Idx.LaneMask == LaneBitmask(1 << DstBit) && + assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) && "Must be a leaf subregister"); - MaskRolPair MaskRol = { LaneBitmask(1), (uint8_t)DstBit }; + MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit }; LaneTransforms.push_back(MaskRol); } else { // Go through all leaf subregisters and find the ones that compose with @@ -1314,7 +1314,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { continue; // Replicate the behaviour from the lane mask generation loop above. unsigned SrcBit = NextBit; - LaneBitmask SrcMask = LaneBitmask(1 << SrcBit); + LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit); if (NextBit < LaneBitmask::BitWidth-1) ++NextBit; assert(Idx2.LaneMask == SrcMask); @@ -1386,7 +1386,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // For classes without any subregisters set LaneMask to 1 instead of 0. // This makes it easier for client code to handle classes uniformly. 
if (LaneMask.none()) - LaneMask = LaneBitmask(1); + LaneMask = LaneBitmask::getLane(0); RegClass.LaneMask = LaneMask; } diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index 2b680846e176..2ef0a8f77ec9 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -25,7 +25,8 @@ class LitConfig(object): params, config_prefix = None, maxIndividualTestTime = 0, maxFailures = None, - parallelism_groups = []): + parallelism_groups = [], + echo_all_commands = False): # The name of the test runner. self.progname = progname # The items to add to the PATH environment variable. @@ -64,6 +65,7 @@ class LitConfig(object): self.maxIndividualTestTime = maxIndividualTestTime self.maxFailures = maxFailures self.parallelism_groups = parallelism_groups + self.echo_all_commands = echo_all_commands @property def maxIndividualTestTime(self): diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 8260d3813345..46bcac4b306e 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -715,6 +715,8 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): else: if test.config.pipefail: f.write('set -o pipefail;') + if litConfig.echo_all_commands: + f.write('set -x;') f.write('{ ' + '; } &&\n{ '.join(commands) + '; }') f.write('\n') f.close() diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py index 530f962d336d..f0162464ce33 100755 --- a/utils/lit/lit/main.py +++ b/utils/lit/lit/main.py @@ -199,6 +199,12 @@ def main_with_tmp(builtinParameters): format_group.add_argument("-v", "--verbose", dest="showOutput", help="Show test output for failures", action="store_true", default=False) + format_group.add_argument("-vv", "--echo-all-commands", + dest="echoAllCommands", + action="store_true", default=False, + help="Echo all commands as they are executed to stdout.\ + In case of failure, last command shown will be the\ + failing one.") format_group.add_argument("-a", "--show-all", dest="showAllOutput", 
help="Display all commandlines and output", action="store_true", default=False) @@ -303,6 +309,9 @@ def main_with_tmp(builtinParameters): if opts.maxFailures == 0: parser.error("Setting --max-failures to 0 does not have any effect.") + if opts.echoAllCommands: + opts.showOutput = True + inputs = args # Create the user defined parameters. @@ -338,7 +347,8 @@ def main_with_tmp(builtinParameters): config_prefix = opts.configPrefix, maxIndividualTestTime = maxIndividualTestTime, maxFailures = opts.maxFailures, - parallelism_groups = {}) + parallelism_groups = {}, + echo_all_commands = opts.echoAllCommands) # Perform test discovery. run = lit.run.Run(litConfig, diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim index e795c7f62133..e110da4329b5 100644 --- a/utils/vim/syntax/llvm.vim +++ b/utils/vim/syntax/llvm.vim @@ -1,7 +1,7 @@ " Vim syntax file " Language: llvm " Maintainer: The LLVM team, http://llvm.org/ -" Version: $Revision: 307419 $ +" Version: $Revision: 308208 $ if version < 600 syntax clear @@ -161,7 +161,7 @@ syn keyword llvmKeyword \ within \ writeonly \ x86_64_sysvcc - \ x86_64_win64cc + \ win64cc \ x86_fastcallcc \ x86_stdcallcc \ x86_thiscallcc -- cgit v1.3