diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 17:58:59 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 17:58:59 +0000 |
commit | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (patch) | |
tree | 2f526c9cfcb089e51c33d6e1f0d51b10bda34714 /lib/Target/X86 | |
parent | d8e91e46262bc44006913e6796843909f1ac7bcd (diff) | |
download | src-1a56a5ead7a2e84bee8240f5f6b033b5f1707154.tar.gz src-1a56a5ead7a2e84bee8240f5f6b033b5f1707154.zip |
Notes
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- | lib/Target/X86/AsmParser/CMakeLists.txt | 4 | ||||
-rw-r--r-- | lib/Target/X86/AsmParser/LLVMBuild.txt | 23 | ||||
-rw-r--r-- | lib/Target/X86/CMakeLists.txt | 81 | ||||
-rw-r--r-- | lib/Target/X86/Disassembler/CMakeLists.txt | 4 | ||||
-rw-r--r-- | lib/Target/X86/Disassembler/LLVMBuild.txt | 23 | ||||
-rw-r--r-- | lib/Target/X86/InstPrinter/CMakeLists.txt | 6 | ||||
-rw-r--r-- | lib/Target/X86/InstPrinter/LLVMBuild.txt | 23 | ||||
-rw-r--r-- | lib/Target/X86/LLVMBuild.txt | 35 | ||||
-rw-r--r-- | lib/Target/X86/MCTargetDesc/CMakeLists.txt | 11 | ||||
-rw-r--r-- | lib/Target/X86/MCTargetDesc/LLVMBuild.txt | 23 | ||||
-rw-r--r-- | lib/Target/X86/README-FPStack.txt | 85 | ||||
-rw-r--r-- | lib/Target/X86/README-SSE.txt | 841 | ||||
-rw-r--r-- | lib/Target/X86/README-X86-64.txt | 184 | ||||
-rw-r--r-- | lib/Target/X86/README.txt | 1794 | ||||
-rw-r--r-- | lib/Target/X86/TargetInfo/CMakeLists.txt | 3 | ||||
-rw-r--r-- | lib/Target/X86/TargetInfo/LLVMBuild.txt | 23 | ||||
-rw-r--r-- | lib/Target/X86/Utils/CMakeLists.txt | 3 | ||||
-rw-r--r-- | lib/Target/X86/Utils/LLVMBuild.txt | 23 |
18 files changed, 0 insertions, 3189 deletions
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt deleted file mode 100644 index b022a41b192f..000000000000 --- a/lib/Target/X86/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_llvm_library(LLVMX86AsmParser - X86AsmInstrumentation.cpp - X86AsmParser.cpp - ) diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt deleted file mode 100644 index 67c0d1358d80..000000000000 --- a/lib/Target/X86/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/AsmParser/LLVMBuild.txt -----------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86AsmParser -parent = X86 -required_libraries = MC MCParser Support X86Desc X86Info X86AsmPrinter -add_to_library_groups = X86 diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt deleted file mode 100644 index 645ca49f2a19..000000000000 --- a/lib/Target/X86/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS X86.td) - -tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) -tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler) -tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) -tablegen(LLVM X86GenExegesis.inc 
-gen-exegesis) -tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) -tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) -tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) -tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info) -tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) - -if (X86_GEN_FOLD_TABLES) - tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables) -endif() - -add_public_tablegen_target(X86CommonTableGen) - -set(sources - ShadowCallStack.cpp - X86AsmPrinter.cpp - X86CallFrameOptimization.cpp - X86CallingConv.cpp - X86CallLowering.cpp - X86CmovConversion.cpp - X86CondBrFolding.cpp - X86DomainReassignment.cpp - X86DiscriminateMemOps.cpp - X86ExpandPseudo.cpp - X86FastISel.cpp - X86FixupBWInsts.cpp - X86FixupLEAs.cpp - X86AvoidStoreForwardingBlocks.cpp - X86FixupSetCC.cpp - X86FlagsCopyLowering.cpp - X86FloatingPoint.cpp - X86FrameLowering.cpp - X86InstructionSelector.cpp - X86ISelDAGToDAG.cpp - X86ISelLowering.cpp - X86IndirectBranchTracking.cpp - X86InterleavedAccess.cpp - X86InsertPrefetch.cpp - X86InstrFMA3Info.cpp - X86InstrFoldTables.cpp - X86InstrInfo.cpp - X86EvexToVex.cpp - X86LegalizerInfo.cpp - X86MCInstLower.cpp - X86MachineFunctionInfo.cpp - X86MacroFusion.cpp - X86OptimizeLEAs.cpp - X86PadShortFunction.cpp - X86RegisterBankInfo.cpp - X86RegisterInfo.cpp - X86RetpolineThunks.cpp - X86SelectionDAGInfo.cpp - X86ShuffleDecodeConstantPool.cpp - X86SpeculativeLoadHardening.cpp - X86Subtarget.cpp - X86TargetMachine.cpp - X86TargetObjectFile.cpp - X86TargetTransformInfo.cpp - X86VZeroUpper.cpp - X86WinAllocaExpander.cpp - X86WinEHState.cpp - ) - -add_llvm_target(X86CodeGen ${sources}) - -add_subdirectory(AsmParser) -add_subdirectory(Disassembler) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) -add_subdirectory(TargetInfo) -add_subdirectory(Utils) diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt deleted file mode 100644 
index 43702826c9bc..000000000000 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_llvm_library(LLVMX86Disassembler - X86Disassembler.cpp - X86DisassemblerDecoder.cpp - ) diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt deleted file mode 100644 index e003fc9f996e..000000000000 --- a/lib/Target/X86/Disassembler/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/Disassembler/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86Disassembler -parent = X86 -required_libraries = MCDisassembler Support X86Info -add_to_library_groups = X86 diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt deleted file mode 100644 index a61efaed33a5..000000000000 --- a/lib/Target/X86/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_llvm_library(LLVMX86AsmPrinter - X86ATTInstPrinter.cpp - X86IntelInstPrinter.cpp - X86InstComments.cpp - X86InstPrinterCommon.cpp - ) diff --git a/lib/Target/X86/InstPrinter/LLVMBuild.txt b/lib/Target/X86/InstPrinter/LLVMBuild.txt deleted file mode 100644 index 6868ddefa51f..000000000000 --- a/lib/Target/X86/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/InstPrinter/LLVMBuild.txt ---------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; 
License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86AsmPrinter -parent = X86 -required_libraries = MC Support X86Utils -add_to_library_groups = X86 diff --git a/lib/Target/X86/LLVMBuild.txt b/lib/Target/X86/LLVMBuild.txt deleted file mode 100644 index 055336baac19..000000000000 --- a/lib/Target/X86/LLVMBuild.txt +++ /dev/null @@ -1,35 +0,0 @@ -;===- ./lib/Target/X86/LLVMBuild.txt ---------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils - -[component_0] -type = TargetGroup -name = X86 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 -has_disassembler = 1 -has_jit = 1 - -[component_1] -type = Library -name = X86CodeGen -parent = X86 -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils GlobalISel ProfileData -add_to_library_groups = X86 diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index 8d0d9fa1215c..000000000000 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -add_llvm_library(LLVMX86Desc - X86AsmBackend.cpp - X86MCTargetDesc.cpp - X86MCAsmInfo.cpp - X86MCCodeEmitter.cpp - X86MachObjectWriter.cpp - X86ELFObjectWriter.cpp - X86WinCOFFObjectWriter.cpp - X86WinCOFFStreamer.cpp - X86WinCOFFTargetStreamer.cpp - ) diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index b9fdc9c483fd..000000000000 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86Desc -parent = X86 -required_libraries = MC MCDisassembler Object Support X86AsmPrinter X86Info -add_to_library_groups = X86 diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt deleted file mode 100644 index 39efd2dbcf1a..000000000000 --- a/lib/Target/X86/README-FPStack.txt +++ /dev/null @@ -1,85 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: FP stack related stuff -//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -Some targets (e.g. athlons) prefer freep to fstp ST(0): -http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html - -//===---------------------------------------------------------------------===// - -This should use fiadd on chips where it is profitable: -double foo(double P, int *I) { return P+*I; } - -We have fiadd patterns now but the followings have the same cost and -complexity. We need a way to specify the later is more profitable. - -def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (extloadf64f32 addr:$src2)))]>; - // ST(0) = ST(0) + [mem32] - -def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP:$dst, (fadd RFP:$src1, - (X86fild addr:$src2, i32)))]>; - // ST(0) = ST(0) + [mem32int] - -//===---------------------------------------------------------------------===// - -The FP stackifier should handle simple permutates to reduce number of shuffle -instructions, e.g. 
turning: - -fld P -> fld Q -fld Q fld P -fxch - -or: - -fxch -> fucomi -fucomi jl X -jg X - -Ideas: -http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html - - -//===---------------------------------------------------------------------===// - -Add a target specific hook to DAG combiner to handle SINT_TO_FP and -FP_TO_SINT when the source operand is already in memory. - -//===---------------------------------------------------------------------===// - -Open code rint,floor,ceil,trunc: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html - -Opencode the sincos[f] libcall. - -//===---------------------------------------------------------------------===// - -None of the FPStack instructions are handled in -X86RegisterInfo::foldMemoryOperand, which prevents the spiller from -folding spill code into the instructions. - -//===---------------------------------------------------------------------===// - -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This just requires being smarter when custom expanding fptoui. - -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt deleted file mode 100644 index 190ca861967a..000000000000 --- a/lib/Target/X86/README-SSE.txt +++ /dev/null @@ -1,841 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend: SSE-specific stuff. 
-//===---------------------------------------------------------------------===// - -//===---------------------------------------------------------------------===// - -SSE Variable shift can be custom lowered to something like this, which uses a -small table + unaligned load + shuffle instead of going through memory. - -__m128i_shift_right: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - -... -__m128i shift_right(__m128i value, unsigned long offset) { - return _mm_shuffle_epi8(value, - _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); -} - -//===---------------------------------------------------------------------===// - -SSE has instructions for doing operations on complex numbers, we should pattern -match them. For example, this should turn into a horizontal add: - -typedef float __attribute__((vector_size(16))) v4f32; -float f32(v4f32 A) { - return A[0]+A[1]+A[2]+A[3]; -} - -Instead we get this: - -_f32: ## @f32 - pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] - addss %xmm0, %xmm1 - pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] - movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] - movaps %xmm0, %xmm3 - addss %xmm1, %xmm3 - movdqa %xmm2, %xmm0 - addss %xmm3, %xmm0 - ret - -Also, there are cases where some simple local SLP would improve codegen a bit. -compiling this: - -_Complex float f32(_Complex float A, _Complex float B) { - return A+B; -} - -into: - -_f32: ## @f32 - movdqa %xmm0, %xmm2 - addss %xmm1, %xmm2 - pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] - pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] - addss %xmm1, %xmm3 - movaps %xmm2, %xmm0 - unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] - ret - -seems silly when it could just be one addps. - - -//===---------------------------------------------------------------------===// - -Expand libm rounding functions inline: Significant speedups possible. 
-http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html - -//===---------------------------------------------------------------------===// - -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs on x86-32. - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. - -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. - -//===---------------------------------------------------------------------===// - -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? - -//===---------------------------------------------------------------------===// - -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0 - %reg1037 = MOV32rm %reg1024, 1, %noreg, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %noreg, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %noreg, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb<cond_next204,0xa914d30> - -Still ok. 
After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %eax = MOV32ri -3 - %edx = MOV32rm %stack.3, 1, %noreg, 0 - ADD32rm %eax<def&use>, %edx, 1, %noreg, 0 - %edx = MOV32rm %stack.7, 1, %noreg, 0 - %edx = MOV32rm %edx, 1, %noreg, 40 - IMUL32rr %eax<def&use>, %edx - %esi = MOV32rm %stack.5, 1, %noreg, 0 - %esi = MOV32rm %esi, 1, %noreg, 0 - MOV32mr %stack.4, 1, %noreg, 0, %esi - %eax = LEA32r %esi, 1, %eax, -3 - %esi = MOV32rm %stack.7, 1, %noreg, 0 - %esi = MOV32rm %esi, 1, %noreg, 32 - %edi = MOV32rr %eax - SHL32ri %edi<def&use>, 4 - ADD32rr %edi<def&use>, %esi - %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0 - %xmm1 = MOVAPSrr %xmm0 - SHUFPSrr %xmm1<def&use>, %xmm1, 170 - %xmm2 = MOVAPSrr %xmm0 - SHUFPSrr %xmm2<def&use>, %xmm2, 0 - %xmm3 = MOVAPSrr %xmm0 - SHUFPSrr %xmm3<def&use>, %xmm3, 255 - SHUFPSrr %xmm0<def&use>, %xmm0, 85 - %ebx = MOV32rr %edi - AND32ri8 %ebx<def&use>, 15 - CMP32ri8 %ebx, 0 - JE mbb<cond_next204,0xa914d30> - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. - -It also exposes some other problems. See MOV32ri -3 and the spills. - -//===---------------------------------------------------------------------===// - -Consider: - -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); -} - -This compiles into: - -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: - -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret - -//===---------------------------------------------------------------------===// - -Here's a sick and twisted idea. 
Consider code like this: - -__m128 test(__m128 a) { - float b = *(float*)&A; - ... - return _mm_set_ps(0.0, 0.0, 0.0, b); -} - -This might compile to this code: - -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret - -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret - -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: - -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... - -movss c2(%esp), %xmm0 -ret - -... saving two instructions. - -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. - -//===---------------------------------------------------------------------===// - -This code generates ugly code, probably due to costs being off or something: - -define void @test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %tmp = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void -} - -Generates: - -_test: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - pxor %xmm1, %xmm1 - movaps %xmm0, %xmm2 - shufps $50, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, (%eax) - ret - -Would it be better to generate: - -_test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret - -? 
- -//===---------------------------------------------------------------------===// - -Some useful information in the Apple Altivec / SSE Migration Guide: - -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html - -e.g. SSE select using and, andnot, or. Various SSE compare translations. - -//===---------------------------------------------------------------------===// - -Add hooks to commute some CMPP operations. - -//===---------------------------------------------------------------------===// - -Apply the same transformation that merged four float into a single 128-bit load -to loads from constant pool. - -//===---------------------------------------------------------------------===// - -Floating point max / min are commutable when -enable-unsafe-fp-path is -specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other -nodes which are selected to max / min instructions that are marked commutable. - -//===---------------------------------------------------------------------===// - -We should materialize vector constants like "all ones" and "signbit" with -code like: - - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - -and: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - psrlq xmm1, 31 ; xmm1 = all 100000000000... - -instead of using a load from the constant pool. The later is important for -ABS/NEG/copysign etc. - -//===---------------------------------------------------------------------===// - -These functions: - -#include <xmmintrin.h> -__m128i a; -void x(unsigned short n) { - a = _mm_slli_epi32 (a, n); -} -void y(unsigned n) { - a = _mm_slli_epi32 (a, n); -} - -compile to ( -O3 -static -fomit-frame-pointer): -_x: - movzwl 4(%esp), %eax - movd %eax, %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret -_y: - movd 4(%esp), %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret - -"y" looks good, but "x" does silly movzwl stuff around into a GPR. 
It seems -like movd would be sufficient in both cases as the value is already zero -extended in the 32-bit stack slot IIRC. For signed short, it should also be -save, as a really-signed value would be undefined for pslld. - - -//===---------------------------------------------------------------------===// - -#include <math.h> -int t1(double d) { return signbit(d); } - -This currently compiles to: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 4(%esp), %eax - shrl $31, %eax - addl $12, %esp - ret - -We should use movmskp{s|d} instead. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single -(aligned) vector load. This functionality has a couple of problems. - -1. The code to infer alignment from loads of globals is in the X86 backend, - not the dag combiner. This is because dagcombine2 needs to be able to see - through the X86ISD::Wrapper node, which DAGCombine can't really do. -2. The code for turning 4 x load into a single vector load is target - independent and should be moved to the dag combiner. -3. The code for turning 4 x load into a vector load can only handle a direct - load from a global or a direct load from the stack. It should be generalized - to handle any load from P, P+4, P+8, P+12, where P can be anything. -4. The alignment inference code cannot handle loads from globals in non-static - mode because it doesn't look through the extra dyld stub load. If you try - vec_align.ll without -relocation-model=static, you'll see what I mean. - -//===---------------------------------------------------------------------===// - -We should lower store(fneg(load p), q) into an integer load+xor+store, which -eliminates a constant pool load. 
For example, consider: - -define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { -entry: - %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] - %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly - ret i64 %tmp20 -} -declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly - -This currently compiles to: - -LCPI1_0: # <4 x float> - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 - .long 2147483648 # float -0 -_ccosf: - subl $12, %esp - movss 16(%esp), %xmm0 - movss %xmm0, 4(%esp) - movss 20(%esp), %xmm0 - xorps LCPI1_0, %xmm0 - movss %xmm0, (%esp) - call L_ccoshf$stub - addl $12, %esp - ret - -Note the load into xmm0, then xor (to negate), then store. In PIC mode, -this code computes the pic base and does two loads to do the constant pool -load, so the improvement is much bigger. - -The tricky part about this xform is that the argument load/store isn't exposed -until post-legalize, and at that point, the fneg has been custom expanded into -an X86 fxor. This means that we need to handle this case in the x86 backend -instead of in target independent code. - -//===---------------------------------------------------------------------===// - -Non-SSE4 insert into 16 x i8 is atrociously bad. - -//===---------------------------------------------------------------------===// - -<2 x i64> extract is substantially worse than <2 x f64>, even if the destination -is memory. - -//===---------------------------------------------------------------------===// - -INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert -any number of 0.0 simultaneously. Currently we only use it for simple -insertions. - -See comments in LowerINSERT_VECTOR_ELT_SSE4. - -//===---------------------------------------------------------------------===// - -On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not -Custom. 
All combinations of insert/extract reg-reg, reg-mem, and mem-reg are -legal, it'll just take a few extra patterns written in the .td file. - -Note: this is not a code quality issue; the custom lowered code happens to be -right, but we shouldn't have to custom lower anything. This is probably related -to <2 x i64> ops being so bad. - -//===---------------------------------------------------------------------===// - -LLVM currently generates stack realignment code, when it is not necessary -needed. The problem is that we need to know about stack alignment too early, -before RA runs. - -At that point we don't know, whether there will be vector spill, or not. -Stack realignment logic is overly conservative here, but otherwise we can -produce unaligned loads/stores. - -Fixing this will require some huge RA changes. - -Testcase: -#include <emmintrin.h> - -typedef short vSInt16 __attribute__ ((__vector_size__ (16))); - -static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, -- 22725, - 12873};; - -vSInt16 madd(vSInt16 b) -{ - return _mm_madd_epi16(a, b); -} - -Generated code (x86-32, linux): -madd: - pushl %ebp - movl %esp, %ebp - andl $-16, %esp - movaps .LCPI1_0, %xmm1 - pmaddwd %xmm1, %xmm0 - movl %ebp, %esp - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Consider: -#include <emmintrin.h> -__m128 foo2 (float x) { - return _mm_set_ps (0, 0, x, 0); -} - -In x86-32 mode, we generate this spiffy code: - -_foo2: - movss 4(%esp), %xmm0 - pshufd $81, %xmm0, %xmm0 - ret - -in x86-64 mode, we generate this code, which could be better: - -_foo2: - xorps %xmm1, %xmm1 - movss %xmm0, %xmm1 - pshufd $81, %xmm1, %xmm0 - ret - -In sse4 mode, we could use insertps to make both better. 
- -Here's another testcase that could use insertps [mem]: - -#include <xmmintrin.h> -extern float x2, x3; -__m128 foo1 (float x1, float x4) { - return _mm_set_ps (x2, x1, x3, x4); -} - -gcc mainline compiles it to: - -foo1: - insertps $0x10, x2(%rip), %xmm0 - insertps $0x10, x3(%rip), %xmm1 - movaps %xmm1, %xmm2 - movlhps %xmm0, %xmm2 - movaps %xmm2, %xmm0 - ret - -//===---------------------------------------------------------------------===// - -We compile vector multiply-by-constant into poor code: - -define <4 x i32> @f(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > - ret <4 x i32> %A -} - -On targets without SSE4.1, this compiles into: - -LCPI1_0: ## <4 x i32> - .long 10 - .long 10 - .long 10 - .long 10 - .text - .align 4,0x90 - .globl _f -_f: - pshufd $3, %xmm0, %xmm1 - movd %xmm1, %eax - imull LCPI1_0+12, %eax - movd %eax, %xmm1 - pshufd $1, %xmm0, %xmm2 - movd %xmm2, %eax - imull LCPI1_0+4, %eax - movd %eax, %xmm2 - punpckldq %xmm1, %xmm2 - movd %xmm0, %eax - imull LCPI1_0, %eax - movd %eax, %xmm1 - movhlps %xmm0, %xmm0 - movd %xmm0, %eax - imull LCPI1_0+8, %eax - movd %eax, %xmm0 - punpckldq %xmm0, %xmm1 - movaps %xmm1, %xmm0 - punpckldq %xmm2, %xmm0 - ret - -It would be better to synthesize integer vector multiplication by constants -using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, -simple cases such as multiplication by powers of two would be better as -vector shifts than as multiplications. 
- -//===---------------------------------------------------------------------===// - -We compile this: - -__m128i -foo2 (char x) -{ - return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); -} - -into: - movl $1, %eax - xorps %xmm0, %xmm0 - pinsrw $2, %eax, %xmm0 - movzbl 4(%esp), %eax - pinsrw $3, %eax, %xmm0 - movl $256, %eax - pinsrw $7, %eax, %xmm0 - ret - - -gcc-4.2: - subl $12, %esp - movzbl 16(%esp), %eax - movdqa LC0, %xmm0 - pinsrw $3, %eax, %xmm0 - addl $12, %esp - ret - .const - .align 4 -LC0: - .word 0 - .word 0 - .word 1 - .word 0 - .word 0 - .word 0 - .word 0 - .word 256 - -With SSE4, it should be - movdqa .LC0(%rip), %xmm0 - pinsrb $6, %edi, %xmm0 - -//===---------------------------------------------------------------------===// - -We should transform a shuffle of two vectors of constants into a single vector -of constants. Also, insertelement of a constant into a vector of constants -should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. - -We compiled it to something horrible: - - .align 4 -LCPI1_1: ## float - .long 1065353216 ## float 1 - .const - - .align 4 -LCPI1_0: ## <4 x float> - .space 4 - .long 1065353216 ## float 1 - .space 4 - .long 1065353216 ## float 1 - .text - .align 4,0x90 - .globl _t -_t: - xorps %xmm0, %xmm0 - movhps LCPI1_0, %xmm0 - movss LCPI1_1, %xmm1 - movaps %xmm0, %xmm2 - shufps $2, %xmm1, %xmm2 - shufps $132, %xmm2, %xmm0 - movaps %xmm0, 0 - -//===---------------------------------------------------------------------===// -rdar://5907648 - -This function: - -float foo(unsigned char x) { - return x; -} - -compiles to (x86-32): - -define float @foo(i8 zeroext %x) nounwind { - %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] - ret float %tmp12 -} - -compiles to: - -_foo: - subl $4, %esp - movzbl 8(%esp), %eax - cvtsi2ss %eax, %xmm0 - movss %xmm0, (%esp) - flds (%esp) - addl $4, %esp - ret - -We should be able to use: - cvtsi2ss 8($esp), %xmm0 -since we know the stack slot is already zext'd. 
-
-//===---------------------------------------------------------------------===//
-
-Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
-when code size is critical. movlps is slower than movsd on core2 but it's one
-byte shorter.
-
-//===---------------------------------------------------------------------===//
-
-We should use a dynamic programming based approach to tell when using FPStack
-operations is cheaper than SSE. SciMark montecarlo contains code like this
-for example:
-
-double MonteCarlo_num_flops(int Num_samples) {
- return ((double) Num_samples)* 4.0;
-}
-
-In fpstack mode, this compiles into:
-
-LCPI1_0:
- .long 1082130432 ## float 4.000000e+00
-_MonteCarlo_num_flops:
- subl $4, %esp
- movl 8(%esp), %eax
- movl %eax, (%esp)
- fildl (%esp)
- fmuls LCPI1_0
- addl $4, %esp
- ret
-
-in SSE mode, it compiles into significantly slower code:
-
-_MonteCarlo_num_flops:
- subl $12, %esp
- cvtsi2sd 16(%esp), %xmm0
- mulsd LCPI1_0, %xmm0
- movsd %xmm0, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-There are also other cases in scimark where using fpstack is better, it is
-cheaper to do fld1 than load from a constant pool for example, so
-"load, add 1.0, store" is better done in the fp stack, etc.
-
-//===---------------------------------------------------------------------===//
-
-These should compile into the same code (PR6214): Perhaps instcombine should
-canonicalize the former into the latter?
- -define float @foo(float %x) nounwind { - %t = bitcast float %x to i32 - %s = and i32 %t, 2147483647 - %d = bitcast i32 %s to float - ret float %d -} - -declare float @fabsf(float %n) -define float @bar(float %x) nounwind { - %d = call float @fabsf(float %x) - ret float %d -} - -//===---------------------------------------------------------------------===// - -This IR (from PR6194): - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-darwin10.0.0" - -%0 = type { double, double } -%struct.float3 = type { float, float, float } - -define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { -entry: - %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] - %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] - %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] - %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] - %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] - %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] - %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] - store float %tmp12, float* %tmp5 - ret void -} - -Compiles to: - -_test: ## @test - movd %xmm0, %rax - shrq $32, %rax - movl %eax, 4(%rdi) - ret - -This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and -doing a shuffle from v[1] to v[0] then a float store. - -//===---------------------------------------------------------------------===// - -[UNSAFE FP] - -void foo(double, double, double); -void norm(double x, double y, double z) { - double scale = __builtin_sqrt(x*x + y*y + z*z); - foo(x/scale, y/scale, z/scale); -} - -We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is -slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first -and emit 3 mulsd in place of the divs. This can be done as a target-independent -transform. 
- -If we're dealing with floats instead of doubles we could even replace the sqrtss -and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the -cost of reduced accuracy. - -//===---------------------------------------------------------------------===// - -This function should be matched to haddpd when the appropriate CPU is enabled: - -#include <x86intrin.h> -double f (__m128d p) { - return p[0] + p[1]; -} - -similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should -turn into hsubpd also. - -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt deleted file mode 100644 index a3ea4595ac1e..000000000000 --- a/lib/Target/X86/README-X86-64.txt +++ /dev/null @@ -1,184 +0,0 @@ -//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// - -AMD64 Optimization Manual 8.2 has some nice information about optimizing integer -multiplication by a constant. How much of it applies to Intel's X86-64 -implementation? There are definite trade-offs to consider: latency vs. register -pressure vs. code size. - -//===---------------------------------------------------------------------===// - -Are we better off using branches instead of cmove to implement FP to -unsigned i64? - -_conv: - ucomiss LC0(%rip), %xmm0 - cvttss2siq %xmm0, %rdx - jb L3 - subss LC0(%rip), %xmm0 - movabsq $-9223372036854775808, %rax - cvttss2siq %xmm0, %rdx - xorq %rax, %rdx -L3: - movq %rdx, %rax - ret - -instead of - -_conv: - movss LCPI1_0(%rip), %xmm1 - cvttss2siq %xmm0, %rcx - movaps %xmm0, %xmm2 - subss %xmm1, %xmm2 - cvttss2siq %xmm2, %rax - movabsq $-9223372036854775808, %rdx - xorq %rdx, %rax - ucomiss %xmm1, %xmm0 - cmovb %rcx, %rax - ret - -Seems like the jb branch has high likelihood of being taken. It would have -saved a few instructions. 
- -//===---------------------------------------------------------------------===// - -It's not possible to reference AH, BH, CH, and DH registers in an instruction -requiring REX prefix. However, divb and mulb both produce results in AH. If isel -emits a CopyFromReg which gets turned into a movb and that can be allocated a -r8b - r15b. - -To get around this, isel emits a CopyFromReg from AX and then right shift it -down by 8 and truncate it. It's not pretty but it works. We need some register -allocation magic to make the hack go away (e.g. putting additional constraints -on the result of the movb). - -//===---------------------------------------------------------------------===// - -The x86-64 ABI for hidden-argument struct returns requires that the -incoming value of %rdi be copied into %rax by the callee upon return. - -The idea is that it saves callers from having to remember this value, -which would often require a callee-saved register. Callees usually -need to keep this value live for most of their body anyway, so it -doesn't add a significant burden on them. - -We currently implement this in codegen, however this is suboptimal -because it means that it would be quite awkward to implement the -optimization for callers. - -A better implementation would be to relax the LLVM IR rules for sret -arguments to allow a function with an sret argument to have a non-void -return type, and to have the front-end to set up the sret argument value -as the return value of the function. The front-end could more easily -emit uses of the returned struct value to be in terms of the function's -lowered return value, and it would free non-C frontends from a -complication only required by a C-based ABI. 
- -//===---------------------------------------------------------------------===// - -We get a redundant zero extension for code like this: - -int mask[1000]; -int foo(unsigned x) { - if (x < 10) - x = x * 45; - else - x = x * 78; - return mask[x]; -} - -_foo: -LBB1_0: ## entry - cmpl $9, %edi - jbe LBB1_3 ## bb -LBB1_1: ## bb1 - imull $78, %edi, %eax -LBB1_2: ## bb2 - movl %eax, %eax <---- - movq _mask@GOTPCREL(%rip), %rcx - movl (%rcx,%rax,4), %eax - ret -LBB1_3: ## bb - imull $45, %edi, %eax - jmp LBB1_2 ## bb2 - -Before regalloc, we have: - - %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags - JMP mbb<bb2,0x203afb0> - Successors according to CFG: 0x203afb0 (#3) - -bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: - Predecessors according to CFG: 0x203aec0 (#0) - %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags - Successors according to CFG: 0x203afb0 (#3) - -bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: - Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) - %reg1027 = PHI %reg1025, mbb<bb,0x203af10>, - %reg1026, mbb<bb1,0x203af60> - %reg1029 = MOVZX64rr32 %reg1027 - -so we'd have to know that IMUL32rri8 leaves the high word zero extended and to -be able to recognize the zero extend. This could also presumably be implemented -if we have whole-function selectiondags. - -//===---------------------------------------------------------------------===// - -Take the following code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): -extern unsigned long table[]; -unsigned long foo(unsigned char *p) { - unsigned long tag = *p; - return table[tag >> 4] + table[tag & 0xf]; -} - -Current code generated: - movzbl (%rdi), %eax - movq %rax, %rcx - andq $240, %rcx - shrq %rcx - andq $15, %rax - movq table(,%rax,8), %rax - addq table(%rcx), %rax - ret - -Issues: -1. First movq should be movl; saves a byte. -2. Both andq's should be andl; saves another two bytes. I think this was - implemented at one point, but subsequently regressed. -3. 
shrq should be shrl; saves another byte. -4. The first andq can be completely eliminated by using a slightly more - expensive addressing mode. - -//===---------------------------------------------------------------------===// - -Consider the following (contrived testcase, but contains common factors): - -#include <stdarg.h> -int test(int x, ...) { - int sum, i; - va_list l; - va_start(l, x); - for (i = 0; i < x; i++) - sum += va_arg(l, int); - va_end(l); - return sum; -} - -Testcase given in C because fixing it will likely involve changing the IR -generated for it. The primary issue with the result is that it doesn't do any -of the optimizations which are possible if we know the address of a va_list -in the current function is never taken: -1. We shouldn't spill the XMM registers because we only call va_arg with "int". -2. It would be nice if we could sroa the va_list. -3. Probably overkill, but it'd be cool if we could peel off the first five -iterations of the loop. - -Other optimizations involving functions which use va_arg on floats which don't -have the address of a va_list taken: -1. Conversely to the above, we shouldn't spill general registers if we only - call va_arg on "double". -2. If we know nothing more than 64 bits wide is read from the XMM registers, - we can change the spilling code to reduce the amount of stack used by half. - -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt deleted file mode 100644 index c06a7b1ade6d..000000000000 --- a/lib/Target/X86/README.txt +++ /dev/null @@ -1,1794 +0,0 @@ -//===---------------------------------------------------------------------===// -// Random ideas for the X86 backend. 
-//===---------------------------------------------------------------------===// - -Improvements to the multiply -> shift/add algorithm: -http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html - -//===---------------------------------------------------------------------===// - -Improve code like this (occurs fairly frequently, e.g. in LLVM): -long long foo(int x) { return 1LL << x; } - -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html -http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html - -Another useful one would be ~0ULL >> X and ~0ULL << X. - -One better solution for 1LL << x is: - xorl %eax, %eax - xorl %edx, %edx - testb $32, %cl - sete %al - setne %dl - sall %cl, %eax - sall %cl, %edx - -But that requires good 8-bit subreg support. - -Also, this might be better. It's an extra shift, but it's one instruction -shorter, and doesn't stress 8-bit subreg support. -(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, -but without the unnecessary and.) - movl %ecx, %eax - shrl $5, %eax - movl %eax, %edx - xorl $1, %edx - sall %cl, %eax - sall %cl. %edx - -64-bit shifts (in general) expand to really bad code. Instead of using -cmovs, we should expand to a conditional branch like GCC produces. - -//===---------------------------------------------------------------------===// - -Some isel ideas: - -1. Dynamic programming based approach when compile time is not an - issue. -2. Code duplication (addressing mode) during isel. -3. Other ideas from "Register-Sensitive Selection, Duplication, and - Sequencing of Instructions". -4. Scheduling for reduced register pressure. E.g. "Minimum Register - Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" - and other related papers. 
- http://citeseer.ist.psu.edu/govindarajan01minimum.html
-
-//===---------------------------------------------------------------------===//
-
-Should we promote i16 to i32 to avoid partial register update stalls?
-
-//===---------------------------------------------------------------------===//
-
-Leave any_extend as pseudo instruction and hint to register
-allocator. Delay codegen until post register allocation.
-Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
-the coalescer how to deal with it though.
-
-//===---------------------------------------------------------------------===//
-
-It appears icc uses push for parameter passing. Need to investigate.
-
-//===---------------------------------------------------------------------===//
-
-The instruction selector sometimes misses folding a load into a compare. The
-pattern is written as (cmp reg, (load p)). Because the compare isn't
-commutative, it is not matched with the load on both sides. The dag combiner
-should be made smart enough to canonicalize the load into the RHS of a compare
-when it can invert the result of the compare for free.
-
-//===---------------------------------------------------------------------===//
-
-In many cases, LLVM generates code like this:
-
-_test:
- movl 8(%esp), %eax
- cmpl %eax, 4(%esp)
- setl %al
- movzbl %al, %eax
- ret
-
-on some processors (which ones?), it is more efficient to do this:
-
-_test:
- movl 8(%esp), %ebx
- xor %eax, %eax
- cmpl %ebx, 4(%esp)
- setl %al
- ret
-
-Doing this correctly is tricky though, as the xor clobbers the flags.
-
-//===---------------------------------------------------------------------===//
-
-We should generate bts/btr/etc instructions on targets where they are cheap or
-when codesize is important.
e.g., for: - -void setbit(int *target, int bit) { - *target |= (1 << bit); -} -void clearbit(int *target, int bit) { - *target &= ~(1 << bit); -} - -//===---------------------------------------------------------------------===// - -Instead of the following for memset char*, 1, 10: - - movl $16843009, 4(%edx) - movl $16843009, (%edx) - movw $257, 8(%edx) - -It might be better to generate - - movl $16843009, %eax - movl %eax, 4(%edx) - movl %eax, (%edx) - movw al, 8(%edx) - -when we can spare a register. It reduces code size. - -//===---------------------------------------------------------------------===// - -Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently -get this: - -define i32 @test1(i32 %X) { - %Y = sdiv i32 %X, 8 - ret i32 %Y -} - -_test1: - movl 4(%esp), %eax - movl %eax, %ecx - sarl $31, %ecx - shrl $29, %ecx - addl %ecx, %eax - sarl $3, %eax - ret - -GCC knows several different ways to codegen it, one of which is this: - -_test1: - movl 4(%esp), %eax - cmpl $-1, %eax - leal 7(%eax), %ecx - cmovle %ecx, %eax - sarl $3, %eax - ret - -which is probably slower, but it's interesting at least :) - -//===---------------------------------------------------------------------===// - -We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl -We should leave these as libcalls for everything over a much lower threshold, -since libc is hand tuned for medium and large mem ops (avoiding RFO for large -stores, TLB preheating, etc) - -//===---------------------------------------------------------------------===// - -Optimize this into something reasonable: - x * copysign(1.0, y) * copysign(1.0, z) - -//===---------------------------------------------------------------------===// - -Optimize copysign(x, *y) to use an integer load from y. 
-
-//===---------------------------------------------------------------------===//
-
-The following tests perform worse with LSR:
-
-lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
-
-//===---------------------------------------------------------------------===//
-
-Adding to the list of cmp / test poor codegen issues:
-
-int test(__m128 *A, __m128 *B) {
- if (_mm_comige_ss(*A, *B))
- return 3;
- else
- return 4;
-}
-
-_test:
- movl 8(%esp), %eax
- movaps (%eax), %xmm0
- movl 4(%esp), %eax
- movaps (%eax), %xmm1
- comiss %xmm0, %xmm1
- setae %al
- movzbl %al, %ecx
- movl $3, %eax
- movl $4, %edx
- cmpl $0, %ecx
- cmove %edx, %eax
- ret
-
-Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
-are a number of issues. 1) We are introducing a setcc between the result of the
-intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
-so an any_extend (which becomes a zero extend) is added.
-
-We probably need some kind of target DAG combine hook to fix this.
-
-//===---------------------------------------------------------------------===//
-
-We generate significantly worse code for this than GCC:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
-http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
-
-There is also one case we do worse on PPC.
-
-//===---------------------------------------------------------------------===//
-
-For this:
-
-int test(int a)
-{
- return a * 3;
-}
-
-We currently emit:
- imull $3, 4(%esp), %eax
-
-Perhaps this is what we really should generate? Is imull three or four
-cycles? Note: ICC generates this:
- movl 4(%esp), %eax
- leal (%eax,%eax,2), %eax
-
-The current instruction priority is based on pattern complexity. The former is
-more "complex" because it folds a load so the latter will not be emitted.
-
-Perhaps we should use AddedComplexity to give LEA32r a higher priority?
We -should always try to match LEA first since the LEA matching code does some -estimate to determine whether the match is profitable. - -However, if we care more about code size, then imull is better. It's two bytes -shorter than movl + leal. - -On a Pentium M, both variants have the same characteristics with regard -to throughput; however, the multiplication has a latency of four cycles, as -opposed to two cycles for the movl+lea variant. - -//===---------------------------------------------------------------------===// - -It appears gcc place string data with linkonce linkage in -.section __TEXT,__const_coal,coalesced instead of -.section __DATA,__const_coal,coalesced. -Take a look at darwin.h, there are other Darwin assembler directives that we -do not make use of. - -//===---------------------------------------------------------------------===// - -define i32 @foo(i32* %a, i32 %t) { -entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] - %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] - %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] - %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] - %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] - %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] - %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] - %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] - br i1 %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret i32 %tmp7 -} -is pessimized by -loop-reduce and -indvars - -//===---------------------------------------------------------------------===// - -u32 to float conversion improvement: - -float uint32_2_float( unsigned u ) { - float fl = (int) (u & 0xffff); - float fh = (int) (u >> 16); - fh *= 0x1.0p16f; - return fh + fl; -} - -00000000 subl $0x04,%esp -00000003 movl 0x08(%esp,1),%eax -00000007 movl %eax,%ecx -00000009 shrl $0x10,%ecx -0000000c cvtsi2ss %ecx,%xmm0 -00000010 andl 
$0x0000ffff,%eax
-00000015 cvtsi2ss %eax,%xmm1
-00000019 mulss 0x00000078,%xmm0
-00000021 addss %xmm1,%xmm0
-00000025 movss %xmm0,(%esp,1)
-0000002a flds (%esp,1)
-0000002d addl $0x04,%esp
-00000030 ret
-
-//===---------------------------------------------------------------------===//
-
-When using fastcc abi, align stack slot of argument of type double on 8 byte
-boundary to improve performance.
-
-//===---------------------------------------------------------------------===//
-
-GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
-simplifications for integer "x cmp y ? a : b".
-
-//===---------------------------------------------------------------------===//
-
-Consider the expansion of:
-
-define i32 @test3(i32 %X) {
- %tmp1 = urem i32 %X, 255
- ret i32 %tmp1
-}
-
-Currently it compiles to:
-
-...
- movl $2155905153, %ecx
- movl 8(%esp), %esi
- movl %esi, %eax
- mull %ecx
-...
-
-This could be "reassociated" into:
-
- movl $2155905153, %eax
- movl 8(%esp), %ecx
- mull %ecx
-
-to avoid the copy. In fact, the existing two-address stuff would do this
-except that mul isn't a commutative 2-addr instruction. I guess this has
-to be done at isel time based on the #uses to mul?
-
-//===---------------------------------------------------------------------===//
-
-Make sure the instruction which starts a loop does not cross a cacheline
-boundary. This requires knowing the exact length of each machine instruction.
-That is somewhat complicated, but doable. Example 256.bzip2:
-
-In the new trace, the hot loop has an instruction which crosses a cacheline
-boundary. In addition to potential cache misses, this can't help decoding as I
-imagine there has to be some kind of complicated decoder reset and realignment
-to grab the bytes from the next cacheline.
- -532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines -942 942 0x3d03 movl %dh, (1809(%esp, %esi) -937 937 0x3d0a incl %esi -3 3 0x3d0b cmpb %bl, %dl -27 27 0x3d0d jnz 0x000062db <main+11707> - -//===---------------------------------------------------------------------===// - -In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. - -//===---------------------------------------------------------------------===// - -This could be a single 16-bit load. - -int f(char *p) { - if ((p[0] == 1) & (p[1] == 2)) return 1; - return 0; -} - -//===---------------------------------------------------------------------===// - -We should inline lrintf and probably other libc functions. - -//===---------------------------------------------------------------------===// - -This code: - -void test(int X) { - if (X) abort(); -} - -is currently compiled to: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne LBB1_1 - addl $12, %esp - ret -LBB1_1: - call L_abort$stub - -It would be better to produce: - -_test: - subl $12, %esp - cmpl $0, 16(%esp) - jne L_abort$stub - addl $12, %esp - ret - -This can be applied to any no-return function call that takes no arguments etc. -Alternatively, the stack save/restore logic could be shrink-wrapped, producing -something like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - subl $12, %esp - call L_abort$stub - -Both are useful in different situations. Finally, it could be shrink-wrapped -and tail called, like this: - -_test: - cmpl $0, 4(%esp) - jne LBB1_1 - ret -LBB1_1: - pop %eax # realign stack. - call L_abort$stub - -Though this probably isn't worth it. - -//===---------------------------------------------------------------------===// - -Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with -a neg instead of a sub instruction. 
Consider: - -int test(char X) { return 7-X; } - -we currently produce: -_test: - movl $7, %eax - movsbl 4(%esp), %ecx - subl %ecx, %eax - ret - -We would use one fewer register if codegen'd as: - - movsbl 4(%esp), %eax - neg %eax - add $7, %eax - ret - -Note that this isn't beneficial if the load can be folded into the sub. In -this case, we want a sub: - -int test(int X) { return 7-X; } -_test: - movl $7, %eax - subl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - -Leaf functions that require one 4-byte spill slot have a prolog like this: - -_foo: - pushl %esi - subl $4, %esp -... -and an epilog like this: - addl $4, %esp - popl %esi - ret - -It would be smaller, and potentially faster, to push eax on entry and to -pop into a dummy register instead of using addl/subl of esp. Just don't pop -into any return registers :) - -//===---------------------------------------------------------------------===// - -The X86 backend should fold (branch (or (setcc, setcc))) into multiple -branches. We generate really poor code for: - -double testf(double a) { - return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0); -} - -For example, the entry BB is: - -_testf: - subl $20, %esp - pxor %xmm0, %xmm0 - movsd 24(%esp), %xmm1 - ucomisd %xmm0, %xmm1 - setnp %al - sete %cl - testb %cl, %al - jne LBB1_5 # UnifiedReturnBlock -LBB1_1: # cond_true - - -it would be better to replace the last four instructions with: - - jp LBB1_1 - je LBB1_5 -LBB1_1: - -We also codegen the inner ?: into a diamond: - - cvtss2sd LCPI1_0(%rip), %xmm2 - cvtss2sd LCPI1_1(%rip), %xmm3 - ucomisd %xmm1, %xmm0 - ja LBB1_3 # cond_true -LBB1_2: # cond_true - movapd %xmm3, %xmm2 -LBB1_3: # cond_true - movapd %xmm2, %xmm0 - ret - -We should sink the load into xmm3 into the LBB1_2 block. This should -be pretty easy, and will nuke all the copies. 
- -//===---------------------------------------------------------------------===// - -This: - #include <algorithm> - inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) - { return std::make_pair(a + b, a + b < a); } - bool no_overflow(unsigned a, unsigned b) - { return !full_add(a, b).second; } - -Should compile to: - addl %esi, %edi - setae %al - movzbl %al, %eax - ret - -on x86-64, instead of the rather stupid-looking: - addl %esi, %edi - setb %al - xorb $1, %al - movzbl %al, %eax - ret - - -//===---------------------------------------------------------------------===// - -The following code: - -bb114.preheader: ; preds = %cond_next94 - %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] - %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] - %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] - %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] - %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] - %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] - %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] - %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] - %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] - %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] - %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] - br label %bb114 - -produces: - -LBB3_5: # bb114.preheader - movswl -68(%ebp), %eax - movl $32, %ecx - movl %ecx, -80(%ebp) - subl %eax, -80(%ebp) - movswl -52(%ebp), %eax - movl %ecx, -84(%ebp) - subl %eax, -84(%ebp) - movswl -70(%ebp), %eax - movl %ecx, -88(%ebp) - subl %eax, -88(%ebp) - movswl -50(%ebp), %eax - subl %eax, %ecx - movl %ecx, -76(%ebp) - movswl -42(%ebp), %eax - movl %eax, -92(%ebp) - movswl -66(%ebp), %eax - movl %eax, -96(%ebp) - movw $0, -98(%ebp) - -This appears to be bad because the RA is not folding the store to the stack -slot into the movl. The above instructions could be: - movl $32, -80(%ebp) -... - movl $32, -84(%ebp) -... -This seems like a cross between remat and spill folding. 
- -This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't -change, so we could simply subtract %eax from %ecx first and then use %ecx (or -vice-versa). - -//===---------------------------------------------------------------------===// - -This code: - - %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] - br i1 %tmp659, label %cond_true662, label %cond_next715 - -produces this: - - testw %cx, %cx - movswl %cx, %esi - jns LBB4_109 # cond_next715 - -Shark tells us that using %cx in the testw instruction is sub-optimal. It -suggests using the 32-bit register (which is what ICC uses). - -//===---------------------------------------------------------------------===// - -We compile this: - -void compare (long long foo) { - if (foo < 4294967297LL) - abort(); -} - -to: - -compare: - subl $4, %esp - cmpl $0, 8(%esp) - setne %al - movzbw %al, %ax - cmpl $1, 12(%esp) - setg %cl - movzbw %cl, %cx - cmove %ax, %cx - testb $1, %cl - jne .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - call abort -.LBB1_2: # UnifiedReturnBlock - addl $4, %esp - ret - -(also really horrible code on ppc). This is due to the expand code for 64-bit -compares. GCC produces multiple branches, which is much nicer: - -compare: - subl $12, %esp - movl 20(%esp), %edx - movl 16(%esp), %eax - decl %edx - jle .L7 -.L5: - addl $12, %esp - ret - .p2align 4,,7 -.L7: - jl .L4 - cmpl $0, %eax - .p2align 4,,8 - ja .L5 -.L4: - .p2align 4,,9 - call abort - -//===---------------------------------------------------------------------===// - -Tail call optimization improvements: Tail call optimization currently -pushes all arguments on the top of the stack (their normal place for -non-tail call optimized calls) that source from the callers arguments -or that source from a virtual register (also possibly sourcing from -callers arguments). -This is done to prevent overwriting of parameters (see example -below) that might be used later. 
- -example: - -int callee(int32, int64); -int caller(int32 arg1, int32 arg2) { - int64 local = arg2 * 2; - return callee(arg2, (int64)local); -} - -[arg1] [!arg2 no longer valid since we moved local onto it] -[arg2] -> [(int64) -[RETADDR] local ] - -Moving arg1 onto the stack slot of callee function would overwrite -arg2 of the caller. - -Possible optimizations: - - - - Analyse the actual parameters of the callee to see which would - overwrite a caller parameter which is used by the callee and only - push them onto the top of the stack. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg1,arg2); - } - - Here we don't need to write any variables to the top of the stack - since they don't overwrite each other. - - int callee (int32 arg1, int32 arg2); - int caller (int32 arg1, int32 arg2) { - return callee(arg2,arg1); - } - - Here we need to push the arguments because they overwrite each - other. - -//===---------------------------------------------------------------------===// - -main () -{ - int i = 0; - unsigned long int z = 0; - - do { - z -= 0x00004000; - i++; - if (i > 0x00040000) - abort (); - } while (z > 0); - exit (0); -} - -gcc compiles this to: - -_main: - subl $28, %esp - xorl %eax, %eax - jmp L2 -L3: - cmpl $262144, %eax - je L10 -L2: - addl $1, %eax - cmpl $262145, %eax - jne L3 - call L_abort$stub -L10: - movl $0, (%esp) - call L_exit$stub - -llvm: - -_main: - subl $12, %esp - movl $1, %eax - movl $16384, %ecx -LBB1_1: # bb - cmpl $262145, %eax - jge LBB1_4 # cond_true -LBB1_2: # cond_next - incl %eax - addl $4294950912, %ecx - cmpl $16384, %ecx - jne LBB1_1 # bb -LBB1_3: # bb11 - xorl %eax, %eax - addl $12, %esp - ret -LBB1_4: # cond_true - call L_abort$stub - -1. LSR should rewrite the first cmp with induction variable %ecx. -2. 
DAG combiner should fold - leal 1(%eax), %edx - cmpl $262145, %edx - => - cmpl $262144, %eax - -//===---------------------------------------------------------------------===// - -define i64 @test(double %X) { - %Y = fptosi double %X to i64 - ret i64 %Y -} - -compiles to: - -_test: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl 4(%esp), %edx - movl (%esp), %eax - addl $20, %esp - #FP_REG_KILL - ret - -This should just fldl directly from the input stack slot. - -//===---------------------------------------------------------------------===// - -This code: -int foo (int x) { return (x & 65535) | 255; } - -Should compile into: - -_foo: - movzwl 4(%esp), %eax - orl $255, %eax - ret - -instead of: -_foo: - movl $65280, %eax - andl 4(%esp), %eax - orl $255, %eax - ret - -//===---------------------------------------------------------------------===// - -We're codegen'ing multiply of long longs inefficiently: - -unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { - return arg1 * arg2; -} - -We compile to (fomit-frame-pointer): - -_LLM: - pushl %esi - movl 8(%esp), %ecx - movl 16(%esp), %esi - movl %esi, %eax - mull %ecx - imull 12(%esp), %esi - addl %edx, %esi - imull 20(%esp), %ecx - movl %esi, %edx - addl %ecx, %edx - popl %esi - ret - -This looks like a scheduling deficiency and lack of remat of the load from -the argument area. ICC apparently produces: - - movl 8(%esp), %ecx - imull 12(%esp), %ecx - movl 16(%esp), %eax - imull 4(%esp), %eax - addl %eax, %ecx - movl 4(%esp), %eax - mull 12(%esp) - addl %ecx, %edx - ret - -Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 - -//===---------------------------------------------------------------------===// - -We can fold a store into "zeroing a reg". Instead of: - -xorl %eax, %eax -movl %eax, 124(%esp) - -we should get: - -movl $0, 124(%esp) - -if the flags of the xor are dead. 
- -Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should -be folded into: shl [mem], 1 - -//===---------------------------------------------------------------------===// - -In SSE mode, we turn abs and neg into a load from the constant pool plus a xor -or and instruction, for example: - - xorpd LCPI1_0, %xmm2 - -However, if xmm2 gets spilled, we end up with really ugly code like this: - - movsd (%esp), %xmm0 - xorpd LCPI1_0, %xmm0 - movsd %xmm0, (%esp) - -Since we 'know' that this is a 'neg', we can actually "fold" the spill into -the neg/abs instruction, turning it into an *integer* operation, like this: - - xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) - -you could also use xorb, but xorl is less likely to lead to a partial register -stall. Here is a contrived testcase: - -double a, b, c; -void test(double *P) { - double X = *P; - a = X; - bar(); - X = -X; - b = X; - bar(); - c = X; -} - -//===---------------------------------------------------------------------===// - -The generated code on x86 for checking for signed overflow on a multiply the -obvious way is much longer than it needs to be. - -int x(int a, int b) { - long long prod = (long long)a*b; - return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); -} - -See PR2053 for more details. - -//===---------------------------------------------------------------------===// - -We should investigate using cdq/ctld (effect: edx = sar eax, 31) -more aggressively; it should cost the same as a move+shift on any modern -processor, but it's a lot shorter. Downside is that it puts more -pressure on register allocation because it has fixed operands. - -Example: -int abs(int x) {return x < 0 ? 
-x : x;} - -gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: -abs: - movl 4(%esp), %eax - cltd - xorl %edx, %eax - subl %edx, %eax - ret - -//===---------------------------------------------------------------------===// - -Take the following code (from -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): - -extern unsigned char first_one[65536]; -int FirstOnet(unsigned long long arg1) -{ - if (arg1 >> 48) - return (first_one[arg1 >> 48]); - return 0; -} - - -The following code is currently generated: -FirstOnet: - movl 8(%esp), %eax - cmpl $65536, %eax - movl 4(%esp), %ecx - jb .LBB1_2 # UnifiedReturnBlock -.LBB1_1: # ifthen - shrl $16, %eax - movzbl first_one(%eax), %eax - ret -.LBB1_2: # UnifiedReturnBlock - xorl %eax, %eax - ret - -We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this -lets us change the cmpl into a testl, which is shorter, and eliminate the shift. - -//===---------------------------------------------------------------------===// - -We compile this function: - -define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { -entry: - %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] - br i1 %tmp2, label %bb7, label %bb - -bb: ; preds = %entry - %tmp6 = add i32 %b, %a ; <i32> [#uses=1] - ret i32 %tmp6 - -bb7: ; preds = %entry - %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] - ret i32 %tmp10 -} - -to: - -foo: # @foo -# %bb.0: # %entry - movl 4(%esp), %ecx - cmpb $0, 16(%esp) - je .LBB0_2 -# %bb.1: # %bb - movl 8(%esp), %eax - addl %ecx, %eax - ret -.LBB0_2: # %bb7 - movl 12(%esp), %edx - movl %ecx, %eax - subl %edx, %eax - ret - -There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a -couple more movls by putting 4(%esp) into %eax instead of %ecx. - -//===---------------------------------------------------------------------===// - -See rdar://4653682. 
- -From flops: - -LBB1_15: # bb310 - cvtss2sd LCPI1_0, %xmm1 - addsd %xmm1, %xmm0 - movsd 176(%esp), %xmm2 - mulsd %xmm0, %xmm2 - movapd %xmm2, %xmm3 - mulsd %xmm3, %xmm3 - movapd %xmm3, %xmm4 - mulsd LCPI1_23, %xmm4 - addsd LCPI1_24, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_25, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_26, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_27, %xmm4 - mulsd %xmm3, %xmm4 - addsd LCPI1_28, %xmm4 - mulsd %xmm3, %xmm4 - addsd %xmm1, %xmm4 - mulsd %xmm2, %xmm4 - movsd 152(%esp), %xmm1 - addsd %xmm4, %xmm1 - movsd %xmm1, 152(%esp) - incl %eax - cmpl %eax, %esi - jge LBB1_15 # bb310 -LBB1_16: # bb358.loopexit - movsd 152(%esp), %xmm0 - addsd %xmm0, %xmm0 - addsd LCPI1_22, %xmm0 - movsd %xmm0, 152(%esp) - -Rather than spilling the result of the last addsd in the loop, we should have -insert a copy to split the interval (one for the duration of the loop, one -extending to the fall through). The register pressure in the loop isn't high -enough to warrant the spill. - -Also check why xmm7 is not used at all in the function. 
- -//===---------------------------------------------------------------------===// - -Take the following: - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" -target triple = "i386-apple-darwin8" -@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] -define fastcc void @abort_gzip() noreturn nounwind { -entry: - %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] - br i1 %tmp.b.i, label %bb.i, label %bb4.i -bb.i: ; preds = %entry - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -bb4.i: ; preds = %entry - store i1 true, i1* @in_exit.4870.b - tail call void @exit( i32 1 ) noreturn nounwind - unreachable -} -declare void @exit(i32) noreturn nounwind - -This compiles into: -_abort_gzip: ## @abort_gzip -## %bb.0: ## %entry - subl $12, %esp - movb _in_exit.4870.b, %al - cmpb $1, %al - jne LBB0_2 - -We somehow miss folding the movb into the cmpb. - -//===---------------------------------------------------------------------===// - -We compile: - -int test(int x, int y) { - return x-y-1; -} - -into (-m64): - -_test: - decl %edi - movl %edi, %eax - subl %esi, %eax - ret - -it would be better to codegen as: x+~y (notl+addl) - -//===---------------------------------------------------------------------===// - -This code: - -int foo(const char *str,...) 
-{ - __builtin_va_list a; int x; - __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); - return x; -} - -gets compiled into this on x86-64: - subq $200, %rsp - movaps %xmm7, 160(%rsp) - movaps %xmm6, 144(%rsp) - movaps %xmm5, 128(%rsp) - movaps %xmm4, 112(%rsp) - movaps %xmm3, 96(%rsp) - movaps %xmm2, 80(%rsp) - movaps %xmm1, 64(%rsp) - movaps %xmm0, 48(%rsp) - movq %r9, 40(%rsp) - movq %r8, 32(%rsp) - movq %rcx, 24(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 8(%rsp) - leaq (%rsp), %rax - movq %rax, 192(%rsp) - leaq 208(%rsp), %rax - movq %rax, 184(%rsp) - movl $48, 180(%rsp) - movl $8, 176(%rsp) - movl 176(%rsp), %eax - cmpl $47, %eax - jbe .LBB1_3 # bb -.LBB1_1: # bb3 - movq 184(%rsp), %rcx - leaq 8(%rcx), %rax - movq %rax, 184(%rsp) -.LBB1_2: # bb4 - movl (%rcx), %eax - addq $200, %rsp - ret -.LBB1_3: # bb - movl %eax, %ecx - addl $8, %eax - addq 192(%rsp), %rcx - movl %eax, 176(%rsp) - jmp .LBB1_2 # bb4 - -gcc 4.3 generates: - subq $96, %rsp -.LCFI0: - leaq 104(%rsp), %rax - movq %rsi, -80(%rsp) - movl $8, -120(%rsp) - movq %rax, -112(%rsp) - leaq -88(%rsp), %rax - movq %rax, -104(%rsp) - movl $8, %eax - cmpl $48, %eax - jb .L6 - movq -112(%rsp), %rdx - movl (%rdx), %eax - addq $96, %rsp - ret - .p2align 4,,10 - .p2align 3 -.L6: - mov %eax, %edx - addq -104(%rsp), %rdx - addl $8, %eax - movl %eax, -120(%rsp) - movl (%rdx), %eax - addq $96, %rsp - ret - -and it gets compiled into this on x86: - pushl %ebp - movl %esp, %ebp - subl $4, %esp - leal 12(%ebp), %eax - movl %eax, -4(%ebp) - leal 16(%ebp), %eax - movl %eax, -4(%ebp) - movl 12(%ebp), %eax - addl $4, %esp - popl %ebp - ret - -gcc 4.3 generates: - pushl %ebp - movl %esp, %ebp - movl 12(%ebp), %eax - popl %ebp - ret - -//===---------------------------------------------------------------------===// - -Teach tblgen not to check bitconvert source type in some cases. 
This allows us -to consolidate the following patterns in X86InstrMMX.td: - -def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))), - (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; - -There are other cases in various td files. - -//===---------------------------------------------------------------------===// - -Take something like the following on x86-32: -unsigned a(unsigned long long x, unsigned y) {return x % y;} - -We currently generate a libcall, but we really shouldn't: the expansion is -shorter and likely faster than the libcall. The expected code is something -like the following: - - movl 12(%ebp), %eax - movl 16(%ebp), %ecx - xorl %edx, %edx - divl %ecx - movl 8(%ebp), %eax - divl %ecx - movl %edx, %eax - ret - -A similar code sequence works for division. - -//===---------------------------------------------------------------------===// - -We currently compile this: - -define i32 @func1(i32 %v1, i32 %v2) nounwind { -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - %sum = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - br i1 %obit, label %overflow, label %normal -normal: - ret i32 %sum -overflow: - call void @llvm.trap() - unreachable -} -declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) -declare void @llvm.trap() - -to: - -_func1: - movl 4(%esp), %eax - addl 8(%esp), %eax - jo LBB1_2 ## overflow -LBB1_1: ## normal - ret -LBB1_2: ## overflow - ud2 - -it would be nice to produce "into" someday. - -//===---------------------------------------------------------------------===// - -Test instructions can be eliminated by using EFLAGS values from arithmetic -instructions. 
This is currently not done for mul, and, or, xor, neg, shl, -sra, srl, shld, shrd, atomic ops, and others. It is also currently not done -for read-modify-write instructions. It is also current not done if the -OF or CF flags are needed. - -The shift operators have the complication that when the shift count is -zero, EFLAGS is not set, so they can only subsume a test instruction if -the shift count is known to be non-zero. Also, using the EFLAGS value -from a shift is apparently very slow on some x86 implementations. - -In read-modify-write instructions, the root node in the isel match is -the store, and isel has no way for the use of the EFLAGS result of the -arithmetic to be remapped to the new node. - -Add and subtract instructions set OF on signed overflow and CF on unsiged -overflow, while test instructions always clear OF and CF. In order to -replace a test with an add or subtract in a situation where OF or CF is -needed, codegen must be able to prove that the operation cannot see -signed or unsigned overflow, respectively. - -//===---------------------------------------------------------------------===// - -memcpy/memmove do not lower to SSE copies when possible. 
A silly example is: -define <16 x float> @foo(<16 x float> %A) nounwind { - %tmp = alloca <16 x float>, align 16 - %tmp2 = alloca <16 x float>, align 16 - store <16 x float> %A, <16 x float>* %tmp - %s = bitcast <16 x float>* %tmp to i8* - %s2 = bitcast <16 x float>* %tmp2 to i8* - call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) - %R = load <16 x float>* %tmp2 - ret <16 x float> %R -} - -declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind - -which compiles to: - -_foo: - subl $140, %esp - movaps %xmm3, 112(%esp) - movaps %xmm2, 96(%esp) - movaps %xmm1, 80(%esp) - movaps %xmm0, 64(%esp) - movl 60(%esp), %eax - movl %eax, 124(%esp) - movl 56(%esp), %eax - movl %eax, 120(%esp) - movl 52(%esp), %eax - <many many more 32-bit copies> - movaps (%esp), %xmm0 - movaps 16(%esp), %xmm1 - movaps 32(%esp), %xmm2 - movaps 48(%esp), %xmm3 - addl $140, %esp - ret - -On Nehalem, it may even be cheaper to just use movups when unaligned than to -fall back to lower-granularity chunks. - -//===---------------------------------------------------------------------===// - -Implement processor-specific optimizations for parity with GCC on these -processors. GCC does two optimizations: - -1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceded by a conditional branch or is the target of a jump. -2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of - code contains more than 3 branches. 
- -The first one is done for all AMDs, Core2, and "Generic" -The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, - Core 2, and "Generic" - -//===---------------------------------------------------------------------===// -Testcase: -int x(int a) { return (a&0xf0)>>4; } - -Current output: - movl 4(%esp), %eax - shrl $4, %eax - andl $15, %eax - ret - -Ideal output: - movzbl 4(%esp), %eax - shrl $4, %eax - ret - -//===---------------------------------------------------------------------===// - -Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch -properly. - -When the return value is not used (i.e. only care about the value in the -memory), x86 does not have to use add to implement these. Instead, it can use -add, sub, inc, dec instructions with the "lock" prefix. - -This is currently implemented using a bit of instruction selection trick. The -issue is the target independent pattern produces one output and a chain and we -want to map it into one that just output a chain. The current trick is to select -it into a MERGE_VALUES with the first definition being an implicit_def. The -proper solution is to add new ISD opcodes for the no-output variant. DAG -combiner can then transform the node before it gets to target node selection. - -Problem #2 is we are adding a whole bunch of x86 atomic instructions when in -fact these instructions are identical to the non-lock versions. We need a way to -add target specific information to target nodes and have this information -carried over to machine instructions. Asm printer (or JIT) can use this -information to add the "lock" prefix. 
- -//===---------------------------------------------------------------------===// - -struct B { - unsigned char y0 : 1; -}; - -int bar(struct B* a) { return a->y0; } - -define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { - %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 - %2 = load i8* %1, align 1 - %3 = and i8 %2, 1 - %4 = zext i8 %3 to i32 - ret i32 %4 -} - -bar: # @bar -# %bb.0: - movb (%rdi), %al - andb $1, %al - movzbl %al, %eax - ret - -Missed optimization: should be movl+andl. - -//===---------------------------------------------------------------------===// - -The x86_64 abi says: - -Booleans, when stored in a memory object, are stored as single byte objects the -value of which is always 0 (false) or 1 (true). - -We are not using this fact: - -int bar(_Bool *a) { return *a; } - -define i32 @bar(i8* nocapture %a) nounwind readonly optsize { - %1 = load i8* %a, align 1, !tbaa !0 - %tmp = and i8 %1, 1 - %2 = zext i8 %tmp to i32 - ret i32 %2 -} - -bar: - movb (%rdi), %al - andb $1, %al - movzbl %al, %eax - ret - -GCC produces - -bar: - movzbl (%rdi), %eax - ret - -//===---------------------------------------------------------------------===// - -Take the following C code: -int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %tmp = xor i32 %b, %a ; <i32> [#uses=1] - %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] - %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - xorl %esi, %edi - testb $-1, %dil - sete %al - movzbl %al, %eax - ret - -A cmpb instead of the xorl+testb would be one instruction shorter. 
- -//===---------------------------------------------------------------------===// - -Given the following C code: -int f(int a, int b) { return (signed char)a == (signed char)b; } - -We generate the following IR with clang: -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - %sext = shl i32 %a, 24 ; <i32> [#uses=1] - %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] - %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] - %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] - %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] - %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] - ret i32 %conv5 -} - -And the following x86 code: - movsbl %sil, %eax - movsbl %dil, %ecx - cmpl %eax, %ecx - sete %al - movzbl %al, %eax - ret - - -It should be possible to eliminate the sign extensions. - -//===---------------------------------------------------------------------===// - -LLVM misses a load+store narrowing opportunity in this code: - -%struct.bf = type { i64, i16, i16, i32 } - -@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] - -define void @t1() nounwind ssp { -entry: - %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] - %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] - %3 = load i32* %2, align 1 ; <i32> [#uses=1] - %4 = and i32 %3, -65537 ; <i32> [#uses=1] - store i32 %4, i32* %2, align 1 - %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] - %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] - %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] - %8 = load i32* %7, align 1 ; <i32> [#uses=1] - %9 = and i32 %8, -131073 ; <i32> [#uses=1] - store i32 %9, i32* %7, align 1 - ret void -} - -LLVM currently emits this: - - movq bfi(%rip), %rax - andl $-65537, 8(%rax) - movq bfi(%rip), %rax - andl $-131073, 8(%rax) - ret - -It could narrow the loads and stores to emit this: - - movq bfi(%rip), %rax - andb $-2, 10(%rax) - movq bfi(%rip), %rax - andb $-3, 10(%rax) - ret - -The 
trouble is that there is a TokenFactor between the store and the -load, making it non-trivial to determine if there's anything between -the load and the store which would prohibit narrowing. - -//===---------------------------------------------------------------------===// - -This code: -void foo(unsigned x) { - if (x == 0) bar(); - else if (x == 1) qux(); -} - -currently compiles into: -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - je LBB0_3 - testl %eax, %eax - jne LBB0_4 - -the testl could be removed: -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - je LBB0_3 - jb LBB0_4 - -0 is the only unsigned number < 1. - -//===---------------------------------------------------------------------===// - -This code: - -%0 = type { i32, i1 } - -define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { -entry: - %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) - %cmp = extractvalue %0 %uadd, 1 - %inc = zext i1 %cmp to i32 - %add = add i32 %x, %sum - %z.0 = add i32 %add, %inc - ret i32 %z.0 -} - -declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone - -compiles to: - -_add32carry: ## @add32carry - addl %esi, %edi - sbbl %ecx, %ecx - movl %edi, %eax - subl %ecx, %eax - ret - -But it could be: - -_add32carry: - leal (%rsi,%rdi), %eax - cmpl %esi, %eax - adcl $0, %eax - ret - -//===---------------------------------------------------------------------===// - -The hot loop of 256.bzip2 contains code that looks a bit like this: - -int foo(char *P, char *Q, int x, int y) { - if (P[0] != Q[0]) - return P[0] < Q[0]; - if (P[1] != Q[1]) - return P[1] < Q[1]; - if (P[2] != Q[2]) - return P[2] < Q[2]; - return P[3] < Q[3]; -} - -In the real code, we get a lot more wrong than this. 
However, even in this -code we generate: - -_foo: ## @foo -## %bb.0: ## %entry - movb (%rsi), %al - movb (%rdi), %cl - cmpb %al, %cl - je LBB0_2 -LBB0_1: ## %if.then - cmpb %al, %cl - jmp LBB0_5 -LBB0_2: ## %if.end - movb 1(%rsi), %al - movb 1(%rdi), %cl - cmpb %al, %cl - jne LBB0_1 -## %bb.3: ## %if.end38 - movb 2(%rsi), %al - movb 2(%rdi), %cl - cmpb %al, %cl - jne LBB0_1 -## %bb.4: ## %if.end60 - movb 3(%rdi), %al - cmpb 3(%rsi), %al -LBB0_5: ## %if.end60 - setl %al - movzbl %al, %eax - ret - -Note that we generate jumps to LBB0_1 which does a redundant compare. The -redundant compare also forces the register values to be live, which prevents -folding one of the loads into the compare. In contrast, GCC 4.2 produces: - -_foo: - movzbl (%rsi), %eax - cmpb %al, (%rdi) - jne L10 -L12: - movzbl 1(%rsi), %eax - cmpb %al, 1(%rdi) - jne L10 - movzbl 2(%rsi), %eax - cmpb %al, 2(%rdi) - jne L10 - movzbl 3(%rdi), %eax - cmpb 3(%rsi), %al -L10: - setl %al - movzbl %al, %eax - ret - -which is "perfect". - -//===---------------------------------------------------------------------===// - -For the branch in the following code: -int a(); -int b(int x, int y) { - if (x & (1<<(y&7))) - return a(); - return y; -} - -We currently generate: - movb %sil, %al - andb $7, %al - movzbl %al, %eax - btl %eax, %edi - jae .LBB0_2 - -movl+andl would be shorter than the movb+andb+movzbl sequence. - -//===---------------------------------------------------------------------===// - -For the following: -struct u1 { - float x, y; -}; -float foo(struct u1 u) { - return u.x + u.y; -} - -We currently generate: - movdqa %xmm0, %xmm1 - pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] - addss %xmm1, %xmm0 - ret - -We could save an instruction here by commuting the addss. 
- -//===---------------------------------------------------------------------===// - -This (from PR9661): - -float clamp_float(float a) { - if (a > 1.0f) - return 1.0f; - else if (a < 0.0f) - return 0.0f; - else - return a; -} - -Could compile to: - -clamp_float: # @clamp_float - movss .LCPI0_0(%rip), %xmm1 - minss %xmm1, %xmm0 - pxor %xmm1, %xmm1 - maxss %xmm1, %xmm0 - ret - -with -ffast-math. - -//===---------------------------------------------------------------------===// - -This function (from PR9803): - -int clamp2(int a) { - if (a > 5) - a = 5; - if (a < 0) - return 0; - return a; -} - -Compiles to: - -_clamp2: ## @clamp2 - pushq %rbp - movq %rsp, %rbp - cmpl $5, %edi - movl $5, %ecx - cmovlel %edi, %ecx - testl %ecx, %ecx - movl $0, %eax - cmovnsl %ecx, %eax - popq %rbp - ret - -The move of 0 could be scheduled above the test to make it is xor reg,reg. - -//===---------------------------------------------------------------------===// - -GCC PR48986. We currently compile this: - -void bar(void); -void yyy(int* p) { - if (__sync_fetch_and_add(p, -1) == 1) - bar(); -} - -into: - movl $-1, %eax - lock - xaddl %eax, (%rdi) - cmpl $1, %eax - je LBB0_2 - -Instead we could generate: - - lock - dec %rdi - je LBB0_2 - -The trick is to match "fetch_and_add(X, -C) == C". - -//===---------------------------------------------------------------------===// - -unsigned t(unsigned a, unsigned b) { - return a <= b ? 
5 : -5; -} - -We generate: - movl $5, %ecx - cmpl %esi, %edi - movl $-5, %eax - cmovbel %ecx, %eax - -GCC: - cmpl %edi, %esi - sbbl %eax, %eax - andl $-10, %eax - addl $5, %eax - -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt deleted file mode 100644 index 1d8a8c1c118e..000000000000 --- a/lib/Target/X86/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMX86Info - X86TargetInfo.cpp - ) diff --git a/lib/Target/X86/TargetInfo/LLVMBuild.txt b/lib/Target/X86/TargetInfo/LLVMBuild.txt deleted file mode 100644 index 6a52ea61d87e..000000000000 --- a/lib/Target/X86/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86Info -parent = X86 -required_libraries = Support -add_to_library_groups = X86 diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt deleted file mode 100644 index b2697467f26a..000000000000 --- a/lib/Target/X86/Utils/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMX86Utils - X86ShuffleDecode.cpp - ) diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt deleted file mode 100644 index fdb886f53a08..000000000000 --- a/lib/Target/X86/Utils/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/X86/Utils/LLVMBuild.txt ---------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = X86Utils -parent = X86 -required_libraries = Support -add_to_library_groups = X86 |