Diffstat (limited to 'lib/Target/X86/README.txt')
-rw-r--r--  lib/Target/X86/README.txt | 335
1 file changed, 145 insertions(+), 190 deletions(-)
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index a305ae6ec5505..c10e1709f6677 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -67,19 +67,6 @@ cmovs, we should expand to a conditional branch like GCC produces.
//===---------------------------------------------------------------------===//
-Compile this:
-_Bool f(_Bool a) { return a!=1; }
-
-into:
- movzbl %dil, %eax
- xorl $1, %eax
- ret
-
-(Although note that this isn't a legal way to express the code that llvm-gcc
-currently generates for that function.)
-
-//===---------------------------------------------------------------------===//
-
Some isel ideas:
1. Dynamic programming based approach when compile time is not an
@@ -109,6 +96,37 @@ It appears icc uses push for parameter passing. Need to investigate.
//===---------------------------------------------------------------------===//
+This:
+
+void foo(void);
+void bar(int x, int *P) {
+ x >>= 2;
+ if (x)
+ foo();
+ *P = x;
+}
+
+compiles into:
+
+ movq %rsi, %rbx
+ movl %edi, %r14d
+ sarl $2, %r14d
+ testl %r14d, %r14d
+ je LBB0_2
+
+Instead of doing an explicit test, we can use the flags off the sar. This
+occurs in a bigger testcase like this, which is pretty common:
+
+#include <vector>
+int test1(std::vector<int> &X) {
+ int Sum = 0;
+ for (long i = 0, e = X.size(); i != e; ++i)
+ X[i] = 0;
+ return Sum;
+}
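+
+A minimal sketch of the hoped-for output for bar() above (register
+assignments assumed to match the current codegen; sarl already sets ZF
+from its result, so the explicit testl is redundant):
+
+	movq	%rsi, %rbx
+	movl	%edi, %r14d
+	sarl	$2, %r14d
+	je	LBB0_2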
+
+//===---------------------------------------------------------------------===//
+
Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.
@@ -394,72 +412,8 @@ boundary to improve performance.
//===---------------------------------------------------------------------===//
-Codegen:
-
-int f(int a, int b) {
- if (a == 4 || a == 6)
- b++;
- return b;
-}
-
-
-as:
-
-or eax, 2
-cmp eax, 6
-jz label
-
-//===---------------------------------------------------------------------===//
-
GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
-simplifications for integer "x cmp y ? a : b". For example, instead of:
-
-int G;
-void f(int X, int Y) {
- G = X < 0 ? 14 : 13;
-}
-
-compiling to:
-
-_f:
- movl $14, %eax
- movl $13, %ecx
- movl 4(%esp), %edx
- testl %edx, %edx
- cmovl %eax, %ecx
- movl %ecx, _G
- ret
-
-it could be:
-_f:
- movl 4(%esp), %eax
- sarl $31, %eax
- notl %eax
- addl $14, %eax
- movl %eax, _G
- ret
-
-etc.
-
-Another is:
-int usesbb(unsigned int a, unsigned int b) {
- return (a < b ? -1 : 0);
-}
-to:
-_usesbb:
- movl 8(%esp), %eax
- cmpl %eax, 4(%esp)
- sbbl %eax, %eax
- ret
-
-instead of:
-_usesbb:
- xorl %eax, %eax
- movl 8(%esp), %ecx
- cmpl %ecx, 4(%esp)
- movl $4294967295, %ecx
- cmovb %ecx, %eax
- ret
+simplifications for integer "x cmp y ? a : b".
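+
+For example, one such identity, sketched in C (an illustration, not
+GCC's code; assumes 32-bit int and an arithmetic right shift of
+negative values, which C leaves implementation-defined):
+
+int sel14or13(int X) {        /* hypothetical helper name */
+  return (~(X >> 31)) + 14;   /* X < 0: ~(-1)+14 == 14; X >= 0: ~0+14 == 13 */
+}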
//===---------------------------------------------------------------------===//
@@ -756,23 +710,17 @@ This:
{ return !full_add(a, b).second; }
Should compile to:
+ addl %esi, %edi
+ setae %al
+ movzbl %al, %eax
+ ret
-
- _Z11no_overflowjj:
- addl %edi, %esi
- setae %al
- ret
-
-FIXME: That code looks wrong; bool return is normally defined as zext.
-
-on x86-64, not:
-
-__Z11no_overflowjj:
- addl %edi, %esi
- cmpl %edi, %esi
- setae %al
- movzbl %al, %eax
- ret
+on x86-64, instead of the rather stupid-looking:
+ addl %esi, %edi
+ setb %al
+ xorb $1, %al
+ movzbl %al, %eax
+ ret
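+
+(The testcase's source is truncated by this hunk; a hypothetical
+reconstruction consistent with the visible fragment, for reference:
+
+#include <utility>
+std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+{ return std::make_pair(a + b, a + b < a); }
+bool no_overflow(unsigned a, unsigned b)
+{ return !full_add(a, b).second; }
+
+Unsigned overflow is detected by the wrapped sum comparing less than an
+operand, which is exactly the carry flag that setae reads off the addl.)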
//===---------------------------------------------------------------------===//
@@ -1040,10 +988,10 @@ _foo:
instead of:
_foo:
- movl $255, %eax
- orl 4(%esp), %eax
- andl $65535, %eax
- ret
+ movl $65280, %eax
+ andl 4(%esp), %eax
+ orl $255, %eax
+ ret
//===---------------------------------------------------------------------===//
@@ -1165,58 +1113,6 @@ abs:
//===---------------------------------------------------------------------===//
-Consider:
-int test(unsigned long a, unsigned long b) { return -(a < b); }
-
-We currently compile this to:
-
-define i32 @test(i32 %a, i32 %b) nounwind {
- %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
- %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
- %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1]
- ret i32 %tmp5
-}
-
-and
-
-_test:
- movl 8(%esp), %eax
- cmpl %eax, 4(%esp)
- setb %al
- movzbl %al, %eax
- negl %eax
- ret
-
-Several deficiencies here. First, we should instcombine zext+neg into sext:
-
-define i32 @test2(i32 %a, i32 %b) nounwind {
- %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
- %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1]
- ret i32 %tmp34
-}
-
-However, before we can do that, we have to fix the bad codegen that we get for
-sext from bool:
-
-_test2:
- movl 8(%esp), %eax
- cmpl %eax, 4(%esp)
- setb %al
- movzbl %al, %eax
- shll $31, %eax
- sarl $31, %eax
- ret
-
-This code should be at least as good as the code above. Once this is fixed, we
-can optimize this specific case even more to:
-
- movl 8(%esp), %eax
- xorl %ecx, %ecx
- cmpl %eax, 4(%esp)
- sbbl %ecx, %ecx
-
-//===---------------------------------------------------------------------===//
-
Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
@@ -1605,6 +1501,8 @@ loop, the value comes into the loop as two values, and
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
constructed BUILD_PAIR which represents the cast value.
+This can be handled by making CodeGenPrepare sink the cast.
+
//===---------------------------------------------------------------------===//
Test instructions can be eliminated by using EFLAGS values from arithmetic
@@ -1736,46 +1634,6 @@ Ideal output:
//===---------------------------------------------------------------------===//
-Testcase:
-int x(int a) { return (a & 0x80) ? 0x100 : 0; }
-int y(int a) { return (a & 0x80) *2; }
-
-Current:
- testl $128, 4(%esp)
- setne %al
- movzbl %al, %eax
- shll $8, %eax
- ret
-
-Better:
- movl 4(%esp), %eax
- addl %eax, %eax
- andl $256, %eax
- ret
-
-This is another general instcombine transformation that is profitable on all
-targets. In LLVM IR, these functions look like this:
-
-define i32 @x(i32 %a) nounwind readnone {
-entry:
- %0 = and i32 %a, 128
- %1 = icmp eq i32 %0, 0
- %iftmp.0.0 = select i1 %1, i32 0, i32 256
- ret i32 %iftmp.0.0
-}
-
-define i32 @y(i32 %a) nounwind readnone {
-entry:
- %0 = shl i32 %a, 1
- %1 = and i32 %0, 256
- ret i32 %1
-}
-
-Replacing an icmp+select with a shift should always be considered profitable in
-instcombine.
-
-//===---------------------------------------------------------------------===//
-
Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
properly.
@@ -1960,3 +1818,100 @@ load, making it non-trivial to determine if there's anything between
the load and the store which would prohibit narrowing.
//===---------------------------------------------------------------------===//
+
+This code:
+void foo(unsigned x) {
+ if (x == 0) bar();
+ else if (x == 1) qux();
+}
+
+currently compiles into:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ testl %eax, %eax
+ jne LBB0_4
+
+the testl could be removed:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ jb LBB0_4
+
+0 is the only unsigned number < 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+%0 = type { i32, i1 }
+
+define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
+entry:
+ %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
+ %cmp = extractvalue %0 %uadd, 1
+ %inc = zext i1 %cmp to i32
+ %add = add i32 %x, %sum
+ %z.0 = add i32 %add, %inc
+ ret i32 %z.0
+}
+
+declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+
+compiles to:
+
+_add32carry: ## @add32carry
+ addl %esi, %edi
+ sbbl %ecx, %ecx
+ movl %edi, %eax
+ subl %ecx, %eax
+ ret
+
+But it could be:
+
+_add32carry:
+ leal (%rsi,%rdi), %eax
+ cmpl %esi, %eax
+ adcl $0, %eax
+ ret
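+
+For reference, a C rendering of the same computation (an assumed source
+shape; the original testcase isn't shown here):
+
+unsigned add32carry(unsigned sum, unsigned x) {
+  unsigned add = sum + x;
+  return add + (add < sum);  /* the compare recovers the carry bit */
+}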
+
+//===---------------------------------------------------------------------===//
+
+This:
+char t(char c) {
+ return c/3;
+}
+
+Compiled with 'clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer',
+this becomes:
+
+_t: ## @t
+ movslq %edi, %rax
+ imulq $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB
+ shrq $32, %rcx
+ addl %ecx, %eax
+ movl %eax, %ecx
+ shrl $31, %ecx
+ shrl %eax
+ addl %ecx, %eax
+ movsbl %al, %eax
+ ret
+
+GCC gets:
+
+_t:
+ movl $86, %eax
+ imulb %dil
+ shrw $8, %ax
+ sarb $7, %dil
+ subb %dil, %al
+ movsbl %al,%eax
+ ret
+
+which is nicer. This also happens for int, not just char.
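+
+GCC's sequence, sketched in C (86/256 approximates 1/3 for 8-bit
+inputs, and subtracting the sign corrects truncation toward zero for
+negative c; assumes an arithmetic right shift of negative values):
+
+char div3(char c) {             /* hypothetical name for the sketch */
+  int q = (c * 86) >> 8;        /* high byte of the widening product */
+  return (char)(q - (c >> 7));  /* c >> 7 is -1 for negative c, else 0 */
+}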
+
+//===---------------------------------------------------------------------===//
+