309 files changed, 5337 insertions, 739 deletions
diff --git a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
index 24848602baf84..0af2445d7fbae 100644
--- a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
+++ b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
@@ -3,7 +3,7 @@
 ; it makes a ton of annoying overlapping live ranges.  This code should not
 ; cause spills!
 ;
-; RUN: llc < %s -march=x86 -stats |& not grep spilled
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep spilled
 
 target datalayout = "e-p:32:32"
 
diff --git a/test/CodeGen/X86/2003-11-03-GlobalBool.ll b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
index 8b0a18550da15..f201b981a8720 100644
--- a/test/CodeGen/X86/2003-11-03-GlobalBool.ll
+++ b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
@@ -1,4 +1,4 @@
 ; RUN: llc < %s -march=x86 | \
-; RUN:   not grep {.byte\[\[:space:\]\]*true}
+; RUN:   not grep ".byte[[:space:]]*true"
 
 @X = global i1 true             ; <i1*> [#uses=0]
diff --git a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
index fea2b54d76305..dde210b776af7 100644
--- a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
+++ b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {(%esp}
-; RUN: llc < %s -march=x86 | grep {pushl	%ebp} | count 1
-; RUN: llc < %s -march=x86 | grep {popl	%ebp} | count 1
+; RUN: llc < %s -march=x86 | grep "(%esp"
+; RUN: llc < %s -march=x86 | grep "pushl	%ebp" | count 1
+; RUN: llc < %s -march=x86 | grep "popl	%ebp" | count 1
 
 declare i8* @llvm.returnaddress(i32)
 
diff --git a/test/CodeGen/X86/2004-03-30-Select-Max.ll b/test/CodeGen/X86/2004-03-30-Select-Max.ll
index c44d10ac5b5a6..e22aa6a093987 100644
--- a/test/CodeGen/X86/2004-03-30-Select-Max.ll
+++ b/test/CodeGen/X86/2004-03-30-Select-Max.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {j\[lgbe\]}
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; CHECK-NOT: {{j[lgbe]}}
 
 define i32 @max(i32 %A, i32 %B) nounwind {
         %gt = icmp sgt i32 %A, %B               ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index dc69ef83103f7..f8bf0991fb148 100644
--- a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 | not grep "subl.*%esp"
 
 define i32 @f(i32 %a, i32 %b) {
         %tmp.2 = mul i32 %a, %a         ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
index 0421896922b9e..1a3d74918d1a6 100644
--- a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86  -stats |& \
+; RUN: llc < %s -march=x86  -stats 2>&1 | \
 ; RUN:   grep asm-printer | grep 7
 
 define i32 @g(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
index 8783a11c060b1..fb1262a372950 100644
--- a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin8 -relocation-model=static > %t
-; RUN: grep {movl	_last} %t | count 1
-; RUN: grep {cmpl.*_last} %t | count 1
+; RUN: grep "movl	_last" %t | count 1
+; RUN: grep "cmpl.*_last" %t | count 1
 
 @block = external global i8*            ; <i8**> [#uses=1]
 @last = external global i32             ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
index b0453299669e2..5cba3efeefb87 100644
--- a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
+++ b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& \
-; RUN:   not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | \
+; RUN:   not grep "Number of register spills"
 ; END.
 
 
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 7d0a6ab0a04c2..1c75f93915a7d 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -stats |& \
+; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
 ; RUN:   grep asm-printer | grep 14
 ;
 @size20 = external global i32		; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched2.ll b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
index 23954d76a5d6a..95eefa1e7196f 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched2.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats  |& \
+; RUN: llc < %s -march=x86 -stats  2>&1 | \
 ; RUN:   grep asm-printer | grep 13
 
 define void @_ZN9__gnu_cxx9hashtableISt4pairIKPKciES3_NS_4hashIS3_EESt10_Select1stIS5_E5eqstrSaIiEE14find_or_insertERKS5__cond_true456.i(i8* %tmp435.i, i32* %tmp449.i.out) nounwind {
diff --git a/test/CodeGen/X86/2006-05-08-InstrSched.ll b/test/CodeGen/X86/2006-05-08-InstrSched.ll
index d58d638562c9a..3419d01fa0831 100644
--- a/test/CodeGen/X86/2006-05-08-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-08-InstrSched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 -relocation-model=static | not grep "subl.*%esp"
 
 @A = external global i16*		; <i16**> [#uses=1]
 @B = external global i32		; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index 38bca283b132e..37c510786a5e1 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN:     grep {asm-printer} | grep 35
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats -realign-stack=0 2>&1 | \
+; RUN:     grep "asm-printer" | grep 35
 
 target datalayout = "e-p:32:32"
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index 3159cec8553e4..c5c74d104863b 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,7 +1,7 @@
 ; PR850
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=att > %t
-; RUN: grep {movl 4(%eax),%ebp} %t
-; RUN: grep {movl 0(%eax), %ebx} %t
+; RUN: grep "movl 4(%eax),%ebp" %t
+; RUN: grep "movl 0(%eax), %ebx" %t
 
 define i32 @foo(i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i) {
 	%tmp9.i.i = call i32 asm sideeffect "push %ebp\0Apush %ebx\0Amovl 4($2),%ebp\0Amovl 0($2), %ebx\0Amovl $1,%eax\0Aint  $$0x80\0Apop  %ebx\0Apop %ebp", "={ax},i,0,{cx},{dx},{si},{di}"( i32 192, i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i )		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
index a19d8f7092c34..56d5f2f3040a5 100644
--- a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
+++ b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=i386 | \
-; RUN:    not grep {movl %eax, %edx}
+; RUN:    not grep "movl %eax, %edx"
 
 define i32 @foo(i32 %t, i32 %C) {
 entry:
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 6ec9a488494a3..a58c9b102d133 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -52,8 +52,8 @@ entry:
         %tmp21 = load double* %tmp20            ; <double> [#uses=1]
         %tmp.upgrd.6 = getelementptr [9 x i8]* @str, i32 0, i64 0               ; <i8*> [#uses=1]
         %tmp.upgrd.7 = call i32 (i8*, ...)* @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 )           ; <i32> [#uses=0]
-        br label %return
-return:         ; preds = %entry
+        br label %finish
+finish:
         %retval.upgrd.8 = load i32* %retval             ; <i32> [#uses=1]
         ret i32 %retval.upgrd.8
 }
diff --git a/test/CodeGen/X86/2006-11-17-IllegalMove.ll b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
index affb7afb1c519..783d9f94caeb9 100644
--- a/test/CodeGen/X86/2006-11-17-IllegalMove.ll
+++ b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep movb %t | count 2
-; RUN: grep {movzb\[wl\]} %t
+; RUN: grep movb %t | count 1
+; RUN: grep "movzb[wl]" %t
 
 
 define void @handle_vector_size_attribute() nounwind {
diff --git a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index a2288986362e5..04d4b8ee57eb1 100644
--- a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86-64 > %t
-; RUN: not grep {,%rsp)} %t
+; RUN: not grep ",%rsp)" %t
 ; PR1103
 
 target datalayout = "e-p:64:64"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 3312e01b3d8ef..3b2e443d7d4ef 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov %gs:72, %eax}
+; RUN: llc < %s -march=x86 | grep "mov %gs:72, %eax"
 target datalayout = "e-p:32:32"
 target triple = "i686-apple-darwin9"
 
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index c1b1ad1c730d8..18b06dc0857c6 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mcpu=yonah -march=x86 | \
-; RUN:   grep {cmpltsd %xmm0, %xmm0}
+; RUN:   grep "cmpltsd %xmm0, %xmm0"
 target datalayout = "e-p:32:32"
 target triple = "i686-apple-darwin9"
 
diff --git a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
index 85a2ecc959ab7..cae68c9f3a1b1 100644
--- a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
+++ b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep {bsrl.*10}
+; RUN: llc < %s | not grep "bsrl.*10"
 ; PR1356
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
index deb39998a3ab9..c3d7e8a054725 100644
--- a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
+++ b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep {addl .12, %esp}
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep "addl .12, %esp"
 ; PR1398
 
 	%struct.S = type { i32, i32 }
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index 77291f063b79f..aa0ee5d074629 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movsbl}
+; RUN: llc < %s -march=x86 | grep "movsbl"
 
 @X = global i32 0               ; <i32*> [#uses=1]
 
diff --git a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
index 5acb05134c7cb..e81534b0110be 100644
--- a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
+++ b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | not grep {lea\[\[:space:\]\]R}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | FileCheck %s
+; CHECK-NOT: lea R
 
 	%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
 	%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
index 228a915e3e5a0..56a109acfc79c 100644
--- a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
+++ b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | grep {foo str$}
+; RUN: llc < %s -relocation-model=static | grep "foo str$"
 ; PR1761
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-pc-linux"
diff --git a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
index 2e95082afa9c1..99df20da2510d 100644
--- a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
+++ b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | grep {(%esp)} | count 2
+; RUN: llc < %s -march=x86 -mcpu=generic | grep "(%esp)" | count 2
 ; PR1872
 
 	%struct.c34007g__designated___XUB = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 266fd7b913256..39af9319c8d15 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -10,10 +10,10 @@
 
 	%struct.indexentry = type { i32, i8*, i8*, i8*, i8*, i8* }
 
-define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind  {
+define i32 @_bfd_stab_section_find_nearest_line(i32 %offset, i1 %cond) nounwind  {
 entry:
 	%tmp910 = add i32 0, %offset		; <i32> [#uses=1]
-	br i1 true, label %bb951, label %bb917
+	br i1 %cond, label %bb951, label %bb917
 
 bb917:		; preds = %entry
 	ret i32 0
@@ -21,7 +21,7 @@ bb917:		; preds = %entry
 bb951:		; preds = %bb986, %entry
 	%tmp955 = sdiv i32 0, 2		; <i32> [#uses=3]
 	%tmp961 = getelementptr %struct.indexentry* null, i32 %tmp955, i32 0		; <i32*> [#uses=1]
-	br i1 true, label %bb986, label %bb967
+	br i1 %cond, label %bb986, label %bb967
 
 bb967:		; preds = %bb951
 	ret i32 0
diff --git a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
index 0091397ca6b0c..9584b718fea06 100644
--- a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
+++ b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast
+; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast -optimize-regalloc=0
 
 define void @SolveCubic(double %a, double %b, double %c, double %d, i32* %solutions, double* %x) {
 entry:
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index bdacf50711289..a1b973d7ccfa6 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& grep {Number of block tails merged} | grep 16
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | grep "Number of block tails merged" | grep 16
 ; PR1909
 
 @.str = internal constant [48 x i8] c"transformed bounds: (%.2f, %.2f), (%.2f, %.2f)\0A\00"		; <[48 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
index 5115e48365fcc..a52b36588a363 100644
--- a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
+++ b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | grep {a:} | not grep ax
-; RUN: llc < %s | grep {b:} | not grep ax
+; RUN: llc < %s | grep "a:" | not grep ax
+; RUN: llc < %s | grep "b:" | not grep ax
 ; PR2078
 ; The clobber list says that "ax" is clobbered.  Make sure that eax isn't 
 ; allocated to the input/output register.
diff --git a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
index da029079c6ffe..9185a36711848 100644
--- a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -march=x86 -mattr=+mmx | grep esi
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -march=x86 -mattr=+mmx | grep esi
 ; PR2082
 ; Local register allocator was refusing to use ESI, EDI, and EBP so it ran out of
 ; registers.
diff --git a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
index 4dc3a10f46479..5ca7e3ed3dbf0 100644
--- a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
+++ b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep {#} | not grep -v {##}
+; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep "#" | not grep -v "##"
 
 	%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
 	%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index 109069e353659..3a1de11ea21bd 100644
--- a/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep {, %e}
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep ", %e"
 
 	%struct.DBC_t = type { i32, i8*, i16, %struct.DBC_t*, i8*, i8*, i8*, i8*, i8*, %struct.DBC_t*, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i16, i16, i32*, i8, i16, %struct.DRVOPT*, i16 }
 	%struct.DRVOPT = type { i16, i32, i8, %struct.DRVOPT* }
diff --git a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 859041eb81ff1..f244793e7a95d 100644
--- a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep {%e}
+; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep "%e"
 ; Make sure xorl operands are 32-bit registers.
 
 	%struct.tm = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
index 5b97eb71cbfdf..7c04206de72ff 100644
--- a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movl > %t
-; RUN: not grep {r\[abcd\]x} %t
-; RUN: not grep {r\[ds\]i} %t
-; RUN: not grep {r\[bs\]p} %t
+; RUN: not grep "r[abcd]x" %t
+; RUN: not grep "r[ds]i" %t
+; RUN: not grep "r[bs]p" %t
 
 	%struct.BITMAP = type { i16, i16, i32, i32, i32, i32, i32, i32, i8*, i8* }
 	%struct.BltData = type { float, float, float, float }
diff --git a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
index c068f8ac632cd..4e73b5aa1cdb6 100644
--- a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast -optimize-regalloc=0
 
 @_ZTVN10Evaluation10GridOutputILi3EEE = external constant [5 x i32 (...)*]		; <[5 x i32 (...)*]*> [#uses=1]
 
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index 99cb8569b3f42..bdac8fd48422c 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -o - | grep {cmpl	\\$\[1\], %}
+; RUN: llc -march=x86-64 < %s -o - | grep "cmpl	\$[1], %"
 
 @.str = internal constant [4 x i8] c"%d\0A\00"
 
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index 1d27fc53ea5ea..c63c890add50b 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,15 +1,36 @@
 ; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %ebp | count 9
-; RUN: llc < %s | grep %ecx | count 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "i386-pc-linux"
 
-define i8* @test(i32 %a, i8* %b)  {
+; CHECK: test1
+; CHECK: pushl %ebp
+define i8* @test1(i32 %a, i8* %b)  {
 entry:
   call void @llvm.eh.unwind.init()
   %foo   = alloca i32
   call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
+  unreachable
+}
+
+; CHECK: test2
+; CHECK: pushl %ebp
+define i8* @test2(i32 %a, i8* %b)  {
+entry:
+  call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
   unreachable
 }
 
diff --git a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
index 86e50c98bfdbf..4b2774b64b7b5 100644
--- a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
+++ b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
 
 ; %0 must not be put in EAX or EDX.
 ; In the first asm, $0 and $2 must not be put in EAX.
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 6867ae7980871..5c2fbeee5c705 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -regalloc=fast       | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
 ; RUN: llc < %s -march=x86 -regalloc=basic      | FileCheck %s
 ; RUN: llc < %s -march=x86 -regalloc=greedy     | FileCheck %s
 
diff --git a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
index 421b931ecd5a2..e504bc3e776c5 100644
--- a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
+++ b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep {ucomiss\[^,\]*esp}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep "ucomiss[^,]*esp"
 
 define void @f(float %wt) {
 entry:
diff --git a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
index 9d144a4be0e90..b2cf34cd2033b 100644
--- a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats |& FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats 2>&1 | FileCheck %s
 ; Now this test spills one register. But a reload in the loop is cheaper than
 ; the divsd so it's a win.
 
@@ -17,8 +17,7 @@ bb:		; preds = %bb, %entry
 ; CHECK: %bb30.loopexit
 ; CHECK: divsd %xmm0
 ; CHECK: movsd %xmm0, 16(%esp)
-; CHECK: .align
-; CHECK-NEXT: %bb3
+; CHECK: %bb3
 bb3:		; preds = %bb30.loopexit, %bb25, %bb3
 	%2 = load i32* null, align 4		; <i32> [#uses=1]
 	%3 = mul i32 %2, 0		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2008-12-23-crazy-address.ll b/test/CodeGen/X86/2008-12-23-crazy-address.ll
index 2edcaea80ce70..0e95c9e34e1cd 100644
--- a/test/CodeGen/X86/2008-12-23-crazy-address.ll
+++ b/test/CodeGen/X86/2008-12-23-crazy-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | grep {lea.*X.*esp} | count 2
+; RUN: llc < %s -march=x86 -relocation-model=static | grep "lea.*X.*esp" | count 2
 
 @X = external global [0 x i32]
 
diff --git a/test/CodeGen/X86/2009-01-31-BigShift2.ll b/test/CodeGen/X86/2009-01-31-BigShift2.ll
index 3e425536d1b94..b478f27a95b95 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift2.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov.*56}
+; RUN: llc < %s -march=x86 | grep "mov.*56"
 ; PR3449
 
 define void @test(<8 x double>* %P, i64* %Q) nounwind {
diff --git a/test/CodeGen/X86/2009-02-25-CommuteBug.ll b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
index 7ea699833ba86..9cbf350940613 100644
--- a/test/CodeGen/X86/2009-02-25-CommuteBug.ll
+++ b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& not grep commuted
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | not grep commuted
 ; rdar://6608609
 
 define <2 x double> @t(<2 x double> %A, <2 x double> %B, <2 x double> %C) nounwind readnone {
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 0b5b7bdd94d70..d50fe6f73a004 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {8 machine-licm}
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn -stats 2>&1 | grep "5 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
 
diff --git a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
index 3564f01a7c433..847a43fb06a1e 100644
--- a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
+++ b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep {.space}
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep ".space"
 ; rdar://6668548
 
 declare double @llvm.sqrt.f64(double) nounwind readonly
diff --git a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
index 8bbdb0e82f784..d934ec9a88f8d 100644
--- a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
+++ b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static -o /dev/null -stats -info-output-file - > %t
 ; RUN: not grep spill %t
-; RUN: not grep {%rsp} %t
-; RUN: not grep {%rbp} %t
+; RUN: not grep "%rsp" %t
+; RUN: not grep "%rbp" %t
 
 ; The register-pressure scheduler should be able to schedule this in a
 ; way that does not require spills.
diff --git a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
index f46eed4769f7e..ad18a0c5b94d4 100644
--- a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
+++ b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of modref unfolded}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats 2>&1 | grep "Number of modref unfolded"
 ; XFAIL: *
 ; 69408 removed the opportunity for this optimization to work
 
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
index 9f5a8c53be183..5cb05e8a796fa 100644
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:     -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
+; RUN:     -mcpu=generic -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
 ; RUN:   FileCheck %s
 ; rdar://6808032
 
diff --git a/test/CodeGen/X86/2009-04-24.ll b/test/CodeGen/X86/2009-04-24.ll
index d6ed0c42230d4..08bf9e3f9f36c 100644
--- a/test/CodeGen/X86/2009-04-24.ll
+++ b/test/CodeGen/X86/2009-04-24.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -relocation-model=pic > %t2
-; RUN: grep {leaq.*TLSGD} %t2
-; RUN: grep {__tls_get_addr} %t2
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -optimize-regalloc=0 -relocation-model=pic > %t2
+; RUN: grep "leaq.*TLSGD" %t2
+; RUN: grep "__tls_get_addr" %t2
 ; PR4004
 
 @i = thread_local global i32 15
diff --git a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
index a2fd2e4c51c95..a6ed74ba2ee9a 100644
--- a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
+++ b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl.*%ebx, 8(%esi)}
+; RUN: llc < %s | grep "movl.*%ebx, 8(%esi)"
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9.0"
 
diff --git a/test/CodeGen/X86/2009-05-30-ISelBug.ll b/test/CodeGen/X86/2009-05-30-ISelBug.ll
index af552d4ce20d4..fe04272082c9c 100644
--- a/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep {movzbl	%\[abcd\]h,}
+; RUN: llc < %s -march=x86-64 | not grep "movzbl	%[abcd]h,"
 
 define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(i32*, i32 %c_nblock_used.2.i, i32 %.reload51, i32* %.out, i32* %.out1, i32* %.out2, i32* %.out3) nounwind {
 newFuncRoot:
diff --git a/test/CodeGen/X86/20090313-signext.ll b/test/CodeGen/X86/20090313-signext.ll
index de930d5126782..b8effa6773552 100644
--- a/test/CodeGen/X86/20090313-signext.ll
+++ b/test/CodeGen/X86/20090313-signext.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86-64 -relocation-model=pic > %t
-; RUN: grep {movswl	%ax, %edi} %t
-; RUN: grep {movw	(%rax), %ax} %t
+; RUN: grep "movswl	%ax, %edi" %t
+; RUN: grep "movw	(%rax), %ax" %t
 ; XFAIL: *
 
 @x = common global i16 0
diff --git a/test/CodeGen/X86/2010-01-19-OptExtBug.ll b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
index cd8960b9ed7b4..eb4a5c04a2ae5 100644
--- a/test/CodeGen/X86/2010-01-19-OptExtBug.ll
+++ b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats |& not grep ext-opt
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats 2>&1 | not grep ext-opt
 
 define fastcc i8* @S_scan_str(i8* %start, i32 %keep_quoted, i32 %keep_delims) nounwind ssp {
 entry:
diff --git a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
index 90eb84d1dc40c..35f233952df00 100644
--- a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
+++ b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast %s -o %t
+; RUN: llc -regalloc=fast -optimize-regalloc=0 %s -o %t
 ; PR7066
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
index 36a99d6f90e74..eb0b150378d69 100644
--- a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
+++ b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast -verify-machineinstrs < %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin"
 
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 4639866afc5ef..9b47bb75bf16e 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast < %s | FileCheck %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s | FileCheck %s
 ; PR7382
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
index c6f4b497af103..be10ad5cc2067 100644
--- a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -12,9 +12,9 @@ declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalData
 
 ; Avoid hoisting the test above loads or copies
 ; CHECK: %entry
-; CHECK: cmpq
+; CHECK: test
 ; CHECK-NOT: mov
-; CHECK: jb
+; CHECK: je
 define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
 entry:
   %0 = load i8** null, align 8
diff --git a/test/CodeGen/X86/2011-04-19-sclr-bb.ll b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
new file mode 100644
index 0000000000000..771e4b3a08159
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Make sure that values of illegal types are not scalarized between basic blocks.
+;CHECK: test
+;CHECK-NOT: pinsrw
+;CHECK-NOT: pextrb
+;CHECK: ret
+define void @test(i1 %cond) {
+ENTRY:
+  br label %LOOP
+LOOP:
+  %vec1 = phi <4 x i1> [ %vec1_or_2, %LOOP ], [ zeroinitializer, %ENTRY ]
+  %vec2 = phi <4 x i1> [ %vec2_and_1, %LOOP ], [ zeroinitializer, %ENTRY ]
+  %vec1_or_2 = or <4 x i1> %vec1, %vec2
+  %vec2_and_1 = and <4 x i1> %vec2, %vec1
+  br i1 %cond, label %LOOP, label %EXIT
+
+EXIT:
+  ret void
+}
+
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index bf7f583aab732..ce63c74fbdfd3 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse | FileCheck %s
 
 define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
 entry:
diff --git a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
index a51dad03039e8..47ef693cc25e6 100644
--- a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
+++ b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats |& FileCheck %s
+; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats 2>&1 | FileCheck %s
 ;
 ; This test should not cause any spilling with RAFast.
 ;
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index 844d674fc9e57..a6f428fdacc35 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -promote-elements -mattr=+sse2,-sse41 | FileCheck %s
+;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
 
 ;CHECK: @max
 ;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index ed5649c60265d..4daf6781495a8 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41
 
 ; Make sure we are not crashing on this code.
 
diff --git a/test/CodeGen/X86/2011-10-11-srl.ll b/test/CodeGen/X86/2011-10-11-srl.ll
index cf9d36f1c48cc..6c6d340fd1a49 100644
--- a/test/CodeGen/X86/2011-10-11-srl.ll
+++ b/test/CodeGen/X86/2011-10-11-srl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=-sse41 
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse41 
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index 6f9188c442684..dc3a08bb4daf5 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse41 < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse41 < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -march=x86-64 -mattr=+sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -march=x86-64 -mattr=-sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
 ; Test case for r146671
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
index 557d49d82f844..477b4deba8209 100644
--- a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
+++ b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=core2 -mattr=+sse | FileCheck %s
 ; PR11940: Do not optimize away movb %al, %ch
 
 %struct.APInt = type { i64* }
diff --git a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
index 101eccabbd49e..18a3313773534 100644
--- a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
+++ b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats |& \
-; RUN:   not grep {Number of machine instructions hoisted out of loops post regalloc}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats 2>&1 | \
+; RUN:   not grep "Number of machine instructions hoisted out of loops post regalloc"
 
 ; rdar://11095580
 
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9543587747a6a..9a66b670c7af4 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,8 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK: func:
-;CHECK: vmovups
+;CHECK: vpxor
+;CHECK: vinsertf128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
 ;CHECK: vmulps
diff --git a/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
new file mode 100644
index 0000000000000..171c3f18dc8bc
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=source | FileCheck %s
+
+; Teach the two-address pass to update the "source" map so it doesn't perform a
+; non-profitable commute using outdated info. The test case would still fail
+; because of a poor pre-RA schedule. That will be fixed by the MI scheduler.
+; rdar://11472010
+define i32 @t(i32 %mask) nounwind readnone ssp {
+entry:
+; CHECK: t:
+; CHECK-NOT: mov
+  %sub = add i32 %mask, -65535
+  %shr = lshr i32 %sub, 23
+  %and = and i32 %mask, 1
+  %add = add i32 %shr, %and
+  ret i32 %add
+}
diff --git a/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
new file mode 100644
index 0000000000000..837fbc0777f73
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -verify-coalescing
+; PR12892
+;
+; Dead code elimination during coalescing causes a live range to split into two
+; virtual registers. Stale identity copies that had already been joined were
+; interfering with the liveness computations.
+
+target triple = "i386-pc-linux-gnu"
+
+define void @_ZN4llvm17AsmMatcherEmitter3runERNS_11raw_ostreamE() align 2 {
+  invoke void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+          to label %1 unwind label %5
+
+; <label>:1                                       ; preds = %0
+  invoke void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+          to label %4 unwind label %2
+
+; <label>:2                                       ; preds = %1
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  unreachable
+
+; <label>:4                                       ; preds = %1
+  invoke void @_ZN4llvm18isCurrentDebugTypeEPKc()
+          to label %12 unwind label %7
+
+; <label>:5                                       ; preds = %0
+  %6 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %33
+
+; <label>:7                                       ; preds = %4
+  %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %9
+
+; <label>:9                                       ; preds = %28, %7
+  %10 = phi { i8*, i32 } [ %29, %28 ], [ %8, %7 ]
+  %11 = extractvalue { i8*, i32 } %10, 1
+  invoke fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev()
+          to label %32 unwind label %35
+
+; <label>:12                                      ; preds = %4
+  invoke void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
+          to label %13 unwind label %16
+
+; <label>:13                                      ; preds = %12
+  br label %14
+
+; <label>:14                                      ; preds = %20, %13
+  %15 = icmp eq i32 undef, 0
+  br i1 %15, label %20, label %18
+
+; <label>:16                                      ; preds = %12
+  %17 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %26
+
+; <label>:18                                      ; preds = %14
+  invoke void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+          to label %19 unwind label %21
+
+; <label>:19                                      ; preds = %18
+  unreachable
+
+; <label>:20                                      ; preds = %14
+  br label %14
+
+; <label>:21                                      ; preds = %18
+  %22 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %23 = extractvalue { i8*, i32 } %22, 1
+  br i1 undef, label %26, label %24
+
+; <label>:24                                      ; preds = %21
+  br i1 undef, label %25, label %26
+
+; <label>:25                                      ; preds = %24
+  unreachable
+
+; <label>:26                                      ; preds = %24, %21, %16
+  %27 = phi i32 [ 0, %16 ], [ %23, %21 ], [ %23, %24 ]
+  invoke void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev()
+          to label %28 unwind label %30
+
+; <label>:28                                      ; preds = %26
+  %29 = insertvalue { i8*, i32 } undef, i32 %27, 1
+  br label %9
+
+; <label>:30                                      ; preds = %26
+  %31 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  unreachable
+
+; <label>:32                                      ; preds = %9
+  br label %33
+
+; <label>:33                                      ; preds = %32, %5
+  %34 = phi i32 [ undef, %5 ], [ %11, %32 ]
+  unreachable
+
+; <label>:35                                      ; preds = %9
+  %36 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  unreachable
+}
+
+declare void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+
+declare void @_ZN4llvm18isCurrentDebugTypeEPKc()
+
+declare fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev() unnamed_addr inlinehint align 2
+
+declare hidden void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev() unnamed_addr align 2
+
+declare void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+
+declare void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
new file mode 100644
index 0000000000000..1c1e8e2f0a21c
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
+entry:
+  ; CHECK: vmovaps
+  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
+  ; CHECK: vmovups
+  %A = load <4 x i32>* %Ap
+  %B = load <4 x i32>* %Bp
+  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %Z, <8 x i32>* %P, align 16
+  ret void
+}
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
new file mode 100644
index 0000000000000..906b748fa4208
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+; CHECK: load_store
+define void @load_store(<4 x i16>* %in) {
+entry:
+; CHECK: movsd
+  %A27 = load <4 x i16>* %in, align 4
+  %A28 = add <4 x i16> %A27, %A27
+; CHECK: movlpd
+  store <4 x i16> %A28, <4 x i16>* %in, align 4
+  ret void
+; CHECK: ret
+}
+
+; Make sure that we store a 64-bit value, even on 32-bit systems.
+;CHECK: store_64
+define void @store_64(<2 x i32>* %ptr) {
+BB:
+  store <2 x i32> zeroinitializer, <2 x i32>* %ptr
+  ret void
+;CHECK: movlpd
+;CHECK: ret
+}
+
+;CHECK: load_64
+define <2 x i32> @load_64(<2 x i32>* %ptr) {
+BB:
+  %t = load <2 x i32>* %ptr
+  ret <2 x i32> %t
+;CHECK: movsd
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/2012-07-10-shufnorm.ll b/test/CodeGen/X86/2012-07-10-shufnorm.ll
new file mode 100644
index 0000000000000..e39df58877f62
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-shufnorm.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx | FileCheck %s
+
+; CHECK: ocl
+define void @ocl() {
+entry:
+  %vext = shufflevector <2 x double> zeroinitializer, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit = shufflevector <8 x double> %vext, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit1 = insertelement <8 x double> %vecinit, double undef, i32 2
+  %vecinit3 = insertelement <8 x double> %vecinit1, double undef, i32 3
+  %vecinit5 = insertelement <8 x double> %vecinit3, double 0.000000e+00, i32 4
+  %vecinit9 = shufflevector <8 x double> %vecinit5, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+  store <8 x double> %vecinit9, <8 x double>* undef
+  ret void
+; CHECK: vxorps
+; CHECK: ret
+}
+
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
new file mode 100644
index 0000000000000..3b7a8a7b871c9
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+
+declare x86_fastcallcc i64 @barrier()
+
+;CHECK: bcast_fold
+;CHECK: vmovaps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
+;CHECK: barrier
+;CHECK: vbroadcastss [[SPILLED]], %ymm0
+;CHECK: ret
+define <8 x float> @bcast_fold( float* %A) {
+BB:
+  %A0 = load float* %A
+  %tt3 = call x86_fastcallcc i64 @barrier()
+  br i1 undef, label %work, label %exit
+
+work:
+  %A1 = insertelement <8 x float> undef, float %A0, i32 0
+  %A2 = shufflevector <8 x float> %A1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %A2
+
+exit:
+  ret <8 x float> undef
+}
diff --git a/test/CodeGen/X86/2012-07-15-tconst_shl.ll b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
new file mode 100644
index 0000000000000..46eca7644ebb0
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+avx2
+; make sure that we are not crashing.
+
+define <16 x i32> @autogen_SD34717() {
+BB:
+  %Shuff7 = shufflevector <16 x i32> zeroinitializer, <16 x i32> zeroinitializer, <16 x i32> <i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 undef, i32 22, i32 24, i32 26, i32 28, i32 30, i32 undef>
+  %B9 = lshr <16 x i32> zeroinitializer, %Shuff7
+  ret <16 x i32> %B9
+}
diff --git a/test/CodeGen/X86/2012-07-15-vshl.ll b/test/CodeGen/X86/2012-07-15-vshl.ll
new file mode 100644
index 0000000000000..cd0fef469e6a7
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-vshl.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx
+; PR13352
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define void @f_f() nounwind {
+allocas:
+  br label %for_loop29
+
+for_loop29:                                       ; preds = %safe_if_after_true, %allocas
+  %indvars.iv596 = phi i64 [ %indvars.iv.next597, %safe_if_after_true ], [ 0, %allocas ]
+  %0 = trunc i64 %indvars.iv596 to i32
+  %smear.15 = insertelement <16 x i32> undef, i32 %0, i32 15
+  %bitop = lshr <16 x i32> zeroinitializer, %smear.15
+  %bitop35 = and <16 x i32> %bitop, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %bitop35_to_bool = icmp ne <16 x i32> %bitop35, zeroinitializer
+  %val_to_boolvec32 = sext <16 x i1> %bitop35_to_bool to <16 x i32>
+  %floatmask.i526 = bitcast <16 x i32> %val_to_boolvec32 to <16 x float>
+  %mask1.i529 = shufflevector <16 x float> %floatmask.i526, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %"internal_mask&function_mask41_any" = icmp eq i32 undef, 0
+  br i1 %"internal_mask&function_mask41_any", label %safe_if_after_true, label %safe_if_run_true
+
+safe_if_after_true:                               ; preds = %for_loop29
+  %indvars.iv.next597 = add i64 %indvars.iv596, 1
+  br label %for_loop29
+
+safe_if_run_true:                                 ; preds = %for_loop29
+  %blend1.i583 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> undef, <8 x float> undef, <8 x float> %mask1.i529) nounwind
+  unreachable
+}
+
diff --git a/test/CodeGen/X86/2012-07-16-LeaUndef.ll b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
new file mode 100644
index 0000000000000..9e5cbd2f33738
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD2543() {
+A:
+  %E83 = add i32 0, 1
+  %E820 = add i32 0, undef
+  br label %C
+C:
+  %B908 = add i32 %E83, %E820
+  store i32 %B908, i32* undef
+  %Sl2391 = select i1 undef, i32 undef, i32 %E83
+  %Cmp3114 = icmp ne i32 %Sl2391, undef
+  br i1 %Cmp3114, label %C, label %G
+G:
+  ret void
+}
diff --git a/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
new file mode 100644
index 0000000000000..17533a1e1649d
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD3100() {
+BB:
+  %FC123 = fptoui float 0x40693F5D00000000 to i1
+  br i1 %FC123, label %V, label %W
+
+V:
+  ret void
+W:
+  ret void
+}
diff --git a/test/CodeGen/X86/2012-07-17-vtrunc.ll b/test/CodeGen/X86/2012-07-17-vtrunc.ll
new file mode 100644
index 0000000000000..2de2f97d7d2d4
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-17-vtrunc.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD33189483() {
+BB:
+  br label %CF76
+
+CF76:                                             ; preds = %CF76, %BB
+  %Shuff13 = shufflevector <4 x i64> zeroinitializer, <4 x i64> undef, <4 x i32> zeroinitializer
+  %Tr16 = trunc <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i1>
+  %E19 = extractelement <8 x i1> %Tr16, i32 2
+  br i1 %E19, label %CF76, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF76
+  %BC = bitcast <4 x i64> %Shuff13 to <4 x double>
+  br label %CF78
+}
diff --git a/test/CodeGen/X86/2012-07-23-select_cc.ll b/test/CodeGen/X86/2012-07-23-select_cc.ll
new file mode 100644
index 0000000000000..33fcb120e1623
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-23-select_cc.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 13428
+
+declare void @use(double)
+
+define void @test() {
+entry:
+  call void @use(double 1.000000e+00)
+  %A = icmp eq i64 undef, 2
+  %B = zext i1 %A to i32
+  %C = sitofp i32 %B to double
+  call void @use(double %C)
+  call void @use(double 0.000000e+00)
+  unreachable
+}
diff --git a/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
new file mode 100644
index 0000000000000..000b853ab8f6e
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; Cmp lowering should not look past the truncate unless the high bits are known
+; zero.
+; rdar://12027825
+
+define void @foo(i8 %arg4, i32 %arg5, i32* %arg14) nounwind {
+bb:
+; CHECK: foo:
+; CHECK-NOT: testl
+; CHECK: testb
+  %tmp48 = zext i8 %arg4 to i32
+  %tmp49 = and i32 %tmp48, 32
+  %tmp50 = add i32 %tmp49, 1593371643
+  %tmp55 = sub i32 %tmp50, 0
+  %tmp56 = add i32 %tmp55, 7787538
+  %tmp57 = xor i32 %tmp56, 1601159181
+  %tmp58 = xor i32 %arg5, 1601159181
+  %tmp59 = and i32 %tmp57, %tmp58
+  %tmp60 = add i32 %tmp59, -1263900958
+  %tmp67 = sub i32 %tmp60, 0
+  %tmp103 = xor i32 %tmp56, 13
+  %tmp104 = trunc i32 %tmp103 to i8
+  %tmp105 = sub i8 0, %tmp104
+  %tmp106 = add i8 %tmp105, -103
+  %tmp113 = sub i8 %tmp106, 0
+  %tmp114 = add i8 %tmp113, -72
+  %tmp141 = icmp ne i32 %tmp67, -1263900958
+  %tmp142 = select i1 %tmp141, i8 %tmp114, i8 undef
+  %tmp143 = xor i8 %tmp142, 81
+  %tmp144 = zext i8 %tmp143 to i32
+  %tmp145 = add i32 %tmp144, 2062143348
+  %tmp152 = sub i32 %tmp145, 0
+  store i32 %tmp152, i32* %arg14
+  ret void
+}
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index 386057f0a3b68..4f1a859fd436b 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,11 +1,12 @@
 ; A test for checking PR 9623
-;RUN: llc -march=x86-64 -mcpu=corei7 -promote-elements < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
 
 target triple = "x86_64-apple-darwin"
 
-; CHECK:  pmulld 
-; CHECK:  paddd  
-; CHECK:  movdqa 
+; CHECK:  pmulld
+; CHECK:  paddd
+; CHECK-NOT:  movdqa
+; CHECK:  ret
 
 define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
 entry:
diff --git a/test/CodeGen/X86/MachineSink-PHIUse.ll b/test/CodeGen/X86/MachineSink-PHIUse.ll
index 3758fd8ce500c..33141680aa92e 100644
--- a/test/CodeGen/X86/MachineSink-PHIUse.ll
+++ b/test/CodeGen/X86/MachineSink-PHIUse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-appel-darwin -disable-cgp-branch-opts -stats |& grep {machine-sink}
+; RUN: llc < %s -mtriple=x86_64-appel-darwin -disable-cgp-branch-opts -stats 2>&1 | grep "machine-sink"
 
 define fastcc void @t() nounwind ssp {
 entry:
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index 8e871f4aeb4df..03d2e472cba67 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,8 +1,6 @@
 ; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -join-physregs | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -join-physregs | FileCheck %s -check-prefix=X64
-
-; Some of these tests depend on -join-physregs to commute instructions.
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
 
 ; The immediate can be encoded in a smaller way if the
 ; instruction is a sub instead of an add.
@@ -101,9 +99,9 @@ define {i32, i1} @test7(i32 %v1, i32 %v2) nounwind {
 }
 
 ; X64: test7:
-; X64: addl %e[[A1]], %eax
+; X64: addl %e[[A1]], %e
 ; X64-NEXT: setb %dl
-; X64-NEXT: ret
+; X64: ret
 
 ; PR5443
 define {i64, i1} @test8(i64 %left, i64 %right) nounwind {
diff --git a/test/CodeGen/X86/addr-label-difference.ll b/test/CodeGen/X86/addr-label-difference.ll
index 49abd8a92e648..15fbec52e2437 100644
--- a/test/CodeGen/X86/addr-label-difference.ll
+++ b/test/CodeGen/X86/addr-label-difference.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | grep {__TEXT,__const}
+; RUN: llc %s -o - | grep "__TEXT,__const"
 ; PR5929
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "i386-apple-darwin10.0"
diff --git a/test/CodeGen/X86/aligned-comm.ll b/test/CodeGen/X86/aligned-comm.ll
index 7715869ed99cf..eab02cc1f9d57 100644
--- a/test/CodeGen/X86/aligned-comm.ll
+++ b/test/CodeGen/X86/aligned-comm.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep {array,16512,7}
-; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep {array,16512,7}
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep "array,16512,7"
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep "array,16512,7"
 
 ; Darwin 9+ should get alignment on common symbols.
 @array = common global [4128 x i32] zeroinitializer, align 128
diff --git a/test/CodeGen/X86/alignment-2.ll b/test/CodeGen/X86/alignment-2.ll
index cc709b52d9341..1f9e85cbb763a 100644
--- a/test/CodeGen/X86/alignment-2.ll
+++ b/test/CodeGen/X86/alignment-2.ll
@@ -18,7 +18,9 @@
 define signext i8 @do_lo_list() nounwind optsize ssp {
 bb:
 ; CHECK:     do_lo_list
-; CHECK-NOT: movaps
+; Make sure we do not use movaps for the global variable.
+; It is okay to use movaps for writing the local variable on stack.
+; CHECK-NOT: movaps {{[0-9]*}}(%{{[a-z]*}}), {{%xmm[0-9]}}
   %myopt = alloca %struct.printQueryOpt, align 4
   %tmp = bitcast %struct.printQueryOpt* %myopt to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* bitcast (%struct.printQueryOpt* getelementptr inbounds (%struct._psqlSettings* @pset, i32 0, i32 4) to i8*), i32 76, i32 4, i1 false)
diff --git a/test/CodeGen/X86/alloca-align-rounding-32.ll b/test/CodeGen/X86/alloca-align-rounding-32.ll
index c0f1a18123e6f..a45284e10cf49 100644
--- a/test/CodeGen/X86/alloca-align-rounding-32.ll
+++ b/test/CodeGen/X86/alloca-align-rounding-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | grep and | count 1
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,10 +6,15 @@ define void @foo(i32 %h) {
   %p = alloca <2 x i64>, i32 %h
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo
+; CHECK-NOT: andl $-32, %eax
 }
 
 define void @foo2(i32 %h) {
   %p = alloca <2 x i64>, i32 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo2
+; CHECK: andl $-32, %esp
+; CHECK: andl $-32, %eax
 }
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3c87dbf2bd78b..3d76fb0aa25bb 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | grep and | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,10 +6,15 @@ define void @foo(i64 %h) {
   %p = alloca <2 x i64>, i64 %h
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo
+; CHECK-NOT: andq $-32, %rax
 }
 
 define void @foo2(i64 %h) {
   %p = alloca <2 x i64>, i64 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo2
+; CHECK: andq $-32, %rsp
+; CHECK: andq $-32, %rax
 }
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index a3dc85ff5ce53..640237d0b504b 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding -join-physregs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
 
 ; PR8365
 ; CHECK: andl	$-64, %edi              # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll
index f0d46a0252c32..47accdbc07b33 100644
--- a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll
+++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mcpu=core2 | grep xorps | count 2
-; RUN: llc < %s -mcpu=core2 | not grep movap
+; RUN: llc < %s -mcpu=core2 | FileCheck %s
 ; PR2715
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -11,8 +10,22 @@ target triple = "x86_64-unknown-linux-gnu"
 	%struct.nsXPTCVariant = type { %struct.nsXPTCMiniVariant, i8*, %struct.nsXPTType, i8 }
 	%struct.nsXPTType = type { %struct.XPTTypeDescriptorPrefix }
 
-define i32 @XPTC_InvokeByIndex(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
+define i32 @test1(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
 entry:
 	call void asm sideeffect "", "{xmm0},{xmm1},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},~{dirflag},~{fpsr},~{flags}"( double undef, double undef, double undef, double 1.0, double undef, double 0.0, double undef, double 0.0 ) nounwind
 	ret i32 0
+	; CHECK: test1
+	; CHECK-NOT: movap
+	; CHECK: xorps
+	; CHECK: xorps
+	; CHECK-NOT: movap
+}
+
+define i64 @test2() nounwind {
+entry:
+  %0 = tail call i64 asm sideeffect "movq $1, $0", "={xmm7},*m,~{dirflag},~{fpsr},~{flags}"(i64* null) nounwind
+  ret i64 %0
+  ; CHECK: test2
+	; CHECK: movq {{.*}}, %xmm7
+	; CHECK: movd %xmm7, %rax
 }
diff --git a/test/CodeGen/X86/atom-lea-sp.ll b/test/CodeGen/X86/atom-lea-sp.ll
index 59427880a71da..19482e13d8c82 100644
--- a/test/CodeGen/X86/atom-lea-sp.ll
+++ b/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck -check-prefix=atom %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck -check-prefix=ATOM %s
 ; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
 
 declare void @use_arr(i8*)
 declare void @many_params(i32, i32, i32, i32, i32, i32)
 
 define void @test1() nounwind {
-; atom: test1:
-; atom: leal -1052(%esp), %esp
-; atom-NOT: sub
-; atom: call
-; atom: leal 1052(%esp), %esp
+; ATOM: test1:
+; ATOM: leal -1052(%esp), %esp
+; ATOM-NOT: sub
+; ATOM: call
+; ATOM: leal 1052(%esp), %esp
 
 ; CHECK: test1:
 ; CHECK: subl
@@ -22,10 +22,10 @@ define void @test1() nounwind {
 }
 
 define void @test2() nounwind {
-; atom: test2:
-; atom: leal -28(%esp), %esp
-; atom: call
-; atom: leal 28(%esp), %esp
+; ATOM: test2:
+; ATOM: leal -28(%esp), %esp
+; ATOM: call
+; ATOM: leal 28(%esp), %esp
 
 ; CHECK: test2:
 ; CHECK-NOT: lea
@@ -34,9 +34,9 @@ define void @test2() nounwind {
 }
 
 define void @test3() nounwind {
-; atom: test3:
-; atom: leal -8(%esp), %esp
-; atom: leal 8(%esp), %esp
+; ATOM: test3:
+; ATOM: leal -8(%esp), %esp
+; ATOM: leal 8(%esp), %esp
 
 ; CHECK: test3:
 ; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index 4dd9a9e3481da..0d97e85358243 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,9 +1,6 @@
-; XFAIL: *
 ; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
 ; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
 ;
-; FIXME: Atom's scheduler is temporarily disabled.
-; XFAIL: *
 
 @a = common global i32 0, align 4
 @b = common global i32 0, align 4
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 7c5abe2095cce..152bece4240fe 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 7729491733929..188efe26d92ad 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx  -mattr=+avx | FileCheck %s
 
 ; AVX128 tests:
 
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index b33493252a5f4..c44beb4bc2b8e 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
 
 define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
   ; CHECK: vaesdec
@@ -1154,7 +1154,7 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestri
-  ; CHECK: movl
+  ; CHECK: seta
   %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1165,7 +1165,7 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestri
-  ; CHECK: movl
+  ; CHECK: sbbl
   %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1176,7 +1176,7 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestri
-  ; CHECK: movl
+  ; CHECK: seto
   %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1187,7 +1187,7 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestri
-  ; CHECK: movl
+  ; CHECK: sets
   %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1198,7 +1198,7 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestri
-  ; CHECK: movl
+  ; CHECK: sete
   %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1209,6 +1209,7 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
   ; CHECK: vpcmpestrm
+  ; CHECK-NOT: vmov
   %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
   ret <16 x i8> %res
 }
@@ -1226,7 +1227,7 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
 
 define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
-  ; CHECK: movl
+  ; CHECK: seta
   %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1235,7 +1236,7 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
-  ; CHECK: movl
+  ; CHECK: sbbl
   %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1244,7 +1245,7 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
-  ; CHECK: movl
+  ; CHECK: seto
   %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1253,7 +1254,7 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
-  ; CHECK: movl
+  ; CHECK: sets
   %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1262,7 +1263,7 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
-  ; CHECK: movl
+  ; CHECK: sete
   %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1271,6 +1272,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistrm
+  ; CHECK-NOT: vmov
   %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
   ret <16 x i8> %res
 }
@@ -2555,3 +2557,36 @@ define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
   ret i32 %tmp
 }
 declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+; CHECK: movntdq
+define void @movnt_dq(i8* %p, <4 x i64> %a1) nounwind {
+  %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
+  tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a2) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
+
+; CHECK: movntps
+define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
+  tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
+
+; CHECK: movntpd
+define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
+  ; add operation forces the execution domain.
+  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
+  ret void
+}
+declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
+
+
+; Check for pclmulqdq
+define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK: vpclmulqdq
+  %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-minmax.ll b/test/CodeGen/X86/avx-minmax.ll
index 7c5882010945d..eff92510348ad 100644
--- a/test/CodeGen/X86/avx-minmax.ll
+++ b/test/CodeGen/X86/avx-minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
 
 ; UNSAFE: maxpd:
 ; UNSAFE: vmaxpd {{.+}}, %xmm
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index 5268ec3a56cde..e203c4ed0298f 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -4,5 +4,5 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
  %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i64>%b
  ; CHECK: test1:
- ; CHECK: vinsertf128
+ ; CHECK-NOT: vinsertf128
  }
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 16c447be17278..9b41709a3b1ba 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -90,8 +90,8 @@ define i32 @test9(<4 x i32> %a) nounwind {
 ; Extract a value which is the result of an undef mask.
 define i32 @test10(<4 x i32> %a) nounwind {
 ; CHECK: @test10
-; CHECK-NEXT: #
-; CHECK-NEXT: ret
+; CHECK-NOT: {{^[^#]*[a-z]}}
+; CHECK: ret
   %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %r = extractelement <8 x i32> %b, i32 2
   ret i32 %r
@@ -149,17 +149,26 @@ entry:
 }
 
 ; PR12413
+; CHECK: shuf1
+; CHECK: vpshufb
+; CHECK: vpshufb
 ; CHECK: vpshufb
 ; CHECK: vpshufb
+define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
+entry:
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+ ret <32 x i8> %0
+}
+
+; Handle the case where only one half of the 256-bit vector is splittable.
+; CHECK: shuf2
 ; CHECK: vpshufb
 ; CHECK: vpshufb
-define <32 x i8> @shuf(<32 x i8> %inval1, <32 x i8> %inval2) {
+; CHECK: vpextrb
+; CHECK: vpextrb
+define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
 entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0,
-i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32
-22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32
-42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32
-62>
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %0
 }
 
@@ -202,3 +211,40 @@ define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
   %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   ret <4 x i64> %t
 }
+
+; CHECK: narrow
+; CHECK: vpermilps
+; CHECK: ret
+define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
+  ret <16 x i16> %t
+}
+
+;CHECK: test17
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+define   <8 x float> @test17(<4 x float> %y) {
+  %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %x
+}
+
+; CHECK: test18
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
+  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x float>%S
+}
+
+; CHECK: test19
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
+  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x float>%S
+}
+
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 148ae7329f4b3..0d403d4bb124e 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -112,3 +112,32 @@ entry:
   %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
   ret <2 x double> %vecinit2.i
 }
+
+; CHECK: _RR
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+  %q = load float* %ptr, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  ; force a chain
+  %j = load i32* %k, align 4
+  store i32 %j, i32* undef
+  ret <4 x float> %vecinit6.i
+}
+
+
+; CHECK: _RR2
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+  %q = load float* %ptr, align 4
+  %v = insertelement <4 x float> undef, float %q, i32 0
+  %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %t
+}
+
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
new file mode 100644
index 0000000000000..b47491335a312
--- /dev/null
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+;
+; Each function performs one vector trunc/sext/zext; the FileCheck directives
+; name the instruction expected in the output and, where present, reject vinsert.
+
+; CHECK: trunc4
+; CHECK: vpermd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
+  %B = trunc <4 x i64> %A to <4 x i32>
+  ret <4 x i32>%B
+}
+
+; CHECK: trunc8
+; CHECK: vpshufb
+; CHECK-NOT: vinsert
+; CHECK: ret
+
+define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
+  %B = trunc <8 x i32> %A to <8 x i16>
+  ret <8 x i16>%B
+}
+
+; CHECK: sext4
+; CHECK: vpmovsxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @sext4(<4 x i32> %A) nounwind {
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+; CHECK: sext8
+; CHECK: vpmovsxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @sext8(<8 x i16> %A) nounwind {
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+; CHECK: zext4
+; CHECK: vpmovzxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @zext4(<4 x i32> %A) nounwind {
+  %B = zext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+; CHECK: zext8
+; CHECK: vpmovzxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @zext8(<8 x i16> %A) nounwind {
+  %B = zext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+; CHECK: zext_8i8_8i32
+; CHECK: vpmovzxwd
+; CHECK: vpand
+; CHECK: ret
+define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
+  %B = zext <8 x i8> %A to <8 x i32>
+  ret <8 x i32>%B
+}
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 3f27a0291b4f0..a6141b0956170 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -976,3 +976,182 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
   ret void
 }
 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
+                     <4 x i32> %idx, <2 x double> %mask) {
+  ; CHECK: vgatherdpd
+  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
+                            i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
+                      <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
+                     <4 x i32> %idx, <4 x double> %mask) {
+  ; CHECK: vgatherdpd
+  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
+                            i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
+                      <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
+                     <2 x i64> %idx, <2 x double> %mask) {
+  ; CHECK: vgatherqpd
+  %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
+                            i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
+                      <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
+                     <4 x i64> %idx, <4 x double> %mask) {
+  ; CHECK: vgatherqpd
+  %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
+                            i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
+                      <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
+                     <4 x i32> %idx, <4 x float> %mask) {
+  ; CHECK: vgatherdps
+  %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
+                            i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
+                      <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
+                     <8 x i32> %idx, <8 x float> %mask) {
+  ; CHECK: vgatherdps
+  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+                            i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
+                      <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
+                     <2 x i64> %idx, <4 x float> %mask) {
+  ; CHECK: vgatherqps
+  %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
+                            i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
+                      <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
+                     <4 x i64> %idx, <4 x float> %mask) {
+  ; CHECK: vgatherqps
+  %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
+                            i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
+                      <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
+                     <4 x i32> %idx, <2 x i64> %mask) {
+  ; CHECK: vpgatherdq
+  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
+                            i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
+                      <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
+                     <4 x i32> %idx, <4 x i64> %mask) {
+  ; CHECK: vpgatherdq
+  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
+                            i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
+                      <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
+                     <2 x i64> %idx, <2 x i64> %mask) {
+  ; CHECK: vpgatherqq
+  %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
+                            i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
+                      <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
+                     <4 x i64> %idx, <4 x i64> %mask) {
+  ; CHECK: vpgatherqq
+  %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
+                            i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
+                      <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
+                     <4 x i32> %idx, <4 x i32> %mask) {
+  ; CHECK: vpgatherdd
+  %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
+                            i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
+                      <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
+                     <8 x i32> %idx, <8 x i32> %mask) {
+  ; CHECK: vpgatherdd
+  %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
+                            i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
+                      <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
+                     <2 x i64> %idx, <4 x i32> %mask) {
+  ; CHECK: vpgatherqd
+  %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
+                            i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
+                      <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
+                     <4 x i64> %idx, <4 x i32> %mask) {
+  ; CHECK: vpgatherqd
+  %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
+                            i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
+                      <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+; PR13298
+define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a,
+                                      <8 x i32> %idx, <8 x float> %mask,
+                                      float* nocapture %out) {
+; CHECK: test_gather_mask
+; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
+; CHECK: vgatherdps [[DEST]]
+;; gather with mask
+  %a_i8 = bitcast float* %a to i8*
+  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+                           i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
+
+;; for debugging, we'll just dump out the mask
+  %out_ptr = bitcast float * %out to <8 x float> *
+  store <8 x float> %mask, <8 x float> * %out_ptr, align 4
+
+  ret <8 x float> %res
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
new file mode 100644
index 0000000000000..c5899fa27426e
--- /dev/null
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
+; The mask for the vpblendw instruction needs to be identical for both halves
+; of the YMM register, so two vpblendw instructions are needed here.
+
+; CHECK: blendw1
+; CHECK: vpblendw
+; CHECK: vpblendw
+; CHECK: ret
+define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i16> %t
+}
+
+; CHECK: vpshufhw $27, %ymm
+define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i16> %shuffle.i
+}
+
+; CHECK: vpshuflw $27, %ymm
+define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle.i
+}
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414761ca0..b804233663d47 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
   ret <8 x i32> %g
 }
 
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+  %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+  ret <8 x float> %g
+}
+
 ; CHECK: _e2
 ; CHECK: vbroadcastss
 ; CHECK: ret
@@ -179,9 +188,170 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
   %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
   %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
   %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
-  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
-  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
-  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
-  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
   ret <8 x i8> %vecinit7.i
 }
+
+
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+  br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+  %A.0.inVF = fmul float undef, 6.553600e+04
+  %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+  %A.0VF = fptosi float %A.0.inVF to i32
+  %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+  %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %1 = and i32 %A.0VF, 65535
+  %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+  %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+  br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+  br label %footer349VF
+
+footer349VF:
+  %2 = mul nsw <8 x i32> undef, %0
+  %3 = mul nsw <8 x i32> undef, %vector1099VF
+  br label %footer329VF
+
+ret:
+  ret void
+}
+
+; CHECK: _inreg0
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
+  %in = insertelement <8 x i32> undef, i32 %scalar, i32 0
+  %wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
+  ret <8 x i32> %wide
+}
+
+; CHECK: _inreg1
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
+  %in = insertelement <8 x float> undef, float %scalar, i32 0
+  %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %wide
+}
+
+; CHECK: _inreg2
+; CHECK: broadcastss
+; CHECK: ret
+define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
+  %in = insertelement <4 x float> undef, float %scalar, i32 0
+  %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %wide
+}
+
+; CHECK: _inreg3
+; CHECK: broadcastsd
+; CHECK: ret
+define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
+  %in = insertelement <4 x double> undef, double %scalar, i32 0
+  %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %wide
+}
+
+;CHECK: _inreg8xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define   <8 x float> @_inreg8xfloat(<8 x float> %a) {
+  %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %b
+}
+
+;CHECK: _inreg4xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define   <4 x float> @_inreg4xfloat(<4 x float> %a) {
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %b
+}
+
+;CHECK: _inreg16xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define   <16 x i16> @_inreg16xi16(<16 x i16> %a) {
+  %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %b
+}
+
+;CHECK: _inreg8xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define   <8 x i16> @_inreg8xi16(<8 x i16> %a) {
+  %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %b
+}
+
+
+;CHECK: _inreg4xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define   <4 x i64> @_inreg4xi64(<4 x i64> %a) {
+  %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
+  ret <4 x i64> %b
+}
+
+;CHECK: _inreg2xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define   <2 x i64> @_inreg2xi64(<2 x i64> %a) {
+  %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %b
+}
+
+;CHECK: _inreg4xdouble
+;CHECK: vbroadcastsd
+;CHECK: ret
+define   <4 x double> @_inreg4xdouble(<4 x double> %a) {
+  %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %b
+}
+
+;CHECK: _inreg2xdouble
+;CHECK: vpbroadcastq
+;CHECK: ret
+define   <2 x double> @_inreg2xdouble(<2 x double> %a) {
+  %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %b
+}
+
+;CHECK: _inreg8xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define   <8 x i32> @_inreg8xi32(<8 x i32> %a) {
+  %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
+  ret <8 x i32> %b
+}
+
+;CHECK: _inreg4xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define   <4 x i32> @_inreg4xi32(<4 x i32> %a) {
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %b
+}
+
+;CHECK: _inreg32xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define   <32 x i8> @_inreg32xi8(<32 x i8> %a) {
+  %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+  ret <32 x i8> %b
+}
+
+;CHECK: _inreg16xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define   <16 x i8> @_inreg16xi8(<16 x i8> %a) {
+  %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %b
+}
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
index c80f2b03343ea..fce6b7f5565c5 100644
--- a/test/CodeGen/X86/basic-promote-integers.ll
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -1,7 +1,7 @@
 ; Test that vectors are scalarized/lowered correctly
 ; (with both legalization methods).
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86                   < %s
+; RUN: llc -march=x86  < %s
+; RUN: llc -march=x86  < %s
 
 ; A simple test to check copyToParts and copyFromParts.
 
diff --git a/test/CodeGen/X86/bigstructret.ll b/test/CodeGen/X86/bigstructret.ll
index 633995d5d7880..3c499fae820ff 100644
--- a/test/CodeGen/X86/bigstructret.ll
+++ b/test/CodeGen/X86/bigstructret.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -march=x86 -o %t
-; RUN: grep "movl	.24601, 12(%ecx)" %t
-; RUN: grep "movl	.48, 8(%ecx)" %t
-; RUN: grep "movl	.24, 4(%ecx)" %t
-; RUN: grep "movl	.12, (%ecx)" %t
+; RUN: llc < %s -march=x86 | FileCheck %s
 
 %0 = type { i32, i32, i32, i32 }
+%1 = type { i1, i1, i1, i32 }
 
-define internal fastcc %0 @ReturnBigStruct() nounwind readnone {
+; CHECK: ReturnBigStruct
+; CHECK: movl $24601, 12(%ecx)
+; CHECK: movl	$48, 8(%ecx)
+; CHECK: movl	$24, 4(%ecx)
+; CHECK: movl	$12, (%ecx)
+
+define fastcc %0 @ReturnBigStruct() nounwind readnone {
 entry:
   %0 = insertvalue %0 zeroinitializer, i32 12, 0
   %1 = insertvalue %0 %0, i32 24, 1
@@ -15,3 +18,17 @@ entry:
   ret %0 %3
 }
 
+; CHECK: ReturnBigStruct2
+; CHECK: movl	$48, 4(%ecx)
+; CHECK: movb	$1, 2(%ecx)
+; CHECK: movb	$1, 1(%ecx)
+; CHECK: movb	$0, (%ecx)
+
+define fastcc %1 @ReturnBigStruct2() nounwind readnone {
+entry:
+  %0 = insertvalue %1 zeroinitializer, i1 false, 0
+  %1 = insertvalue %1 %0, i1 true, 1
+  %2 = insertvalue %1 %1, i1 true, 2
+  %3 = insertvalue %1 %2, i32 48, 3
+  ret %1 %3
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 3a10c70ada853..11f811f8cf63d 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
 
 
 ; In this test we check that sign-extend of the mask bit is performed by
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index fc7b6383b8b04..5534712af8325 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -7,10 +7,15 @@ define i32 @test_ifchains(i32 %i, i32* %a, i32 %b) {
 ; that is not expected to run.
 ; CHECK: test_ifchains:
 ; CHECK: %entry
+; CHECK-NOT: .align
 ; CHECK: %else1
+; CHECK-NOT: .align
 ; CHECK: %else2
+; CHECK-NOT: .align
 ; CHECK: %else3
+; CHECK-NOT: .align
 ; CHECK: %else4
+; CHECK-NOT: .align
 ; CHECK: %exit
 ; CHECK: %then1
 ; CHECK: %then2
@@ -76,8 +81,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
 ; Check that we sink cold loop blocks after the hot loop body.
 ; CHECK: test_loop_cold_blocks:
 ; CHECK: %entry
+; CHECK-NOT: .align
 ; CHECK: %unlikely1
+; CHECK-NOT: .align
 ; CHECK: %unlikely2
+; CHECK: .align
 ; CHECK: %body1
 ; CHECK: %body2
 ; CHECK: %body3
@@ -634,7 +642,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
 ;
 ; CHECK: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
-; CHECK: %body
+; CHECK: [[BODY:# BB#[0-9]+]]:
 ; CHECK: %loop2b
 ; CHECK: %loop1
 ; CHECK: %loop2a
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
new file mode 100644
index 0000000000000..0cb9fd9bc533f
--- /dev/null
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+
+define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
+  %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)  ; ptestz intrinsic with %c as both operands
+  %t2 = icmp ne i32 %t1, 0                ; i1: intrinsic result != 0
+  %t3 = select i1 %t2, i32 %a, i32 %b     ; the i1 feeds a select (expected as cmov below)
+  ret i32 %t3
+; CHECK: foo
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: cmov
+; CHECK: ret
+}
+
+define i32 @bar(<2 x i64> %c) {
+entry:
+  %0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)  ; ptestz intrinsic with %c as both operands
+  %1 = icmp ne i32 %0, 0                  ; i1: intrinsic result != 0
+  br i1 %1, label %if-true-block, label %endif-block  ; the i1 feeds a conditional branch (expected as jne below)
+if-true-block:                                    ; preds = %entry
+  ret i32 0
+endif-block:                                      ; preds = %entry
+  ret i32 1
+; CHECK: bar
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: jne
+; CHECK: ret
+}
+
+define i32 @bax(<2 x i64> %c) {
+  %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)  ; ptestz intrinsic with %c as both operands
+  %t2 = icmp eq i32 %t1, 1                ; i1: intrinsic result == 1 (compared with 1, not 0)
+  %t3 = zext i1 %t2 to i32                ; widen the i1 to the i32 return value
+  ret i32 %t3
+; CHECK: bax
+; CHECK: ptest
+; CHECK-NOT: cmpl
+; CHECK: ret
+}
+
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index 2c37194938018..522346301162b 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 < %s | FileCheck %s
 
 ; CHECK: orq
-; CHECK-NEXT: LBB0_1
+; CHECK-NEXT: %bb8.i329
 
 @_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
 @_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index 93b20437e1e89..c94261467c9db 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,10 +1,12 @@
 ; Without list-burr scheduling we may not see the difference in codegen here.
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
-; RUN:   grep {%xmm0} %t | count 14
-; RUN:   not grep {%xmm1} %t
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -break-anti-dependencies=critical > %t
-; RUN:   grep {%xmm0} %t | count 7
-; RUN:   grep {%xmm1} %t | count 7
+; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
+; breaker requires liveness information to be kept.
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN:   grep "%xmm0" %t | count 14
+; RUN:   not grep "%xmm1" %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
+; RUN:   grep "%xmm0" %t | count 7
+; RUN:   grep "%xmm1" %t | count 7
 
 define void @goo(double* %r, double* %p, double* %q) nounwind {
 entry:
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
index 2dee5754256a1..4d801891da5cc 100644
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s
 
 define double @t1(float* nocapture %x) nounwind readonly ssp {
 entry:
@@ -34,8 +34,7 @@ entry:
 define double @squirt(double* %x) nounwind {
 entry:
 ; CHECK: squirt:
-; CHECK: movsd ([[A0]]), %xmm0
-; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK: sqrtsd ([[A0]]), %xmm0
   %z = load double* %x
   %t = call double @llvm.sqrt.f64(double %z)
   ret double %t
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 3857fb157905f..38cda4d140405 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep {call.*12345678}
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep "call.*12345678"
 
 ; Call to immediate is not safe on x86-64 unless we *know* that the
 ; call will be within 32-bits pcrel from the dest immediate.
 
-; RUN: llc < %s -march=x86-64 | grep {call.*\\*%rax}
+; RUN: llc < %s -march=x86-64 | grep "call.*\*%rax"
 
 ; PR3666
 ; PR3773
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 7420ce730475d..8cdd59e9ae934 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -4,7 +4,7 @@
 %0 = type opaque
 %struct.NSConstantString = type { i32*, i32, i8*, i32 }
 
-; Make sure that the string ends up the the correct section.
+; Make sure that the string ends up in the correct section.
 
 ; CHECK:        .section __TEXT,__cstring
 ; CHECK-NEXT: l_.str3:
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
new file mode 100644
index 0000000000000..780746ab1ae44
--- /dev/null
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+
+; cmp with single-use load, should not form cmov.
+define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y)  {
+  %load = load double* %b, align 8
+  %cmp = fcmp olt double %load, %a
+  %cond = select i1 %cmp, i32 %x, i32 %y
+  ret i32 %cond
+; CHECK: test1:
+; CHECK: ucomisd
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Sanity check: no load.
+define i32 @test2(double %a, double %b, i32 %x, i32 %y)  {
+  %cmp = fcmp ogt double %a, %b
+  %cond = select i1 %cmp, i32 %x, i32 %y
+  ret i32 %cond
+; CHECK: test2:
+; CHECK: ucomisd
+; CHECK: cmov
+}
+
+; Multiple uses of %a, should not form cmov.
+define i32 @test3(i32 %a, i32* nocapture %b, i32 %x)  {
+  %load = load i32* %b, align 4
+  %cmp = icmp ult i32 %load, %a
+  %cond = select i1 %cmp, i32 %a, i32 %x
+  ret i32 %cond
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Multiple uses of the load.
+define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y)  {
+  %load = load i32* %b, align 4
+  %cmp = icmp ult i32 %load, %a
+  %cond = select i1 %cmp, i32 %x, i32 %y
+  %add = add i32 %cond, %load
+  ret i32 %add
+; CHECK: test4:
+; CHECK: cmpl
+; CHECK: cmov
+}
+
+; Multiple uses of the cmp.
+define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+  %load = load i32* %b, align 4
+  %cmp = icmp ult i32 %load, %a
+  %cmp1 = icmp ugt i32 %load, %a
+  %cond = select i1 %cmp1, i32 %a, i32 %y
+  %cond5 = select i1 %cmp, i32 %cond, i32 %x
+  ret i32 %cond5
+; CHECK: test5:
+; CHECK: cmpl
+; CHECK: cmov
+; CHECK: cmov
+}
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 2e7ffbfd546d9..ed25c82fddaca 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK: test1:
-; CHECK: btl
-; CHECK-NEXT: movl	$12, %eax
+; CHECK: movl	$12, %eax
+; CHECK-NEXT: btl
 ; CHECK-NEXT: cmovael	(%rcx), %eax
 ; CHECK-NEXT: ret
 
@@ -19,8 +19,8 @@ entry:
 define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK: test2:
-; CHECK: btl
-; CHECK-NEXT: movl	$12, %eax
+; CHECK: movl	$12, %eax
+; CHECK-NEXT: btl
 ; CHECK-NEXT: cmovbl	(%rcx), %eax
 ; CHECK-NEXT: ret
 
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index ef5e353e9f9fb..eb06327f55a69 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -90,3 +90,64 @@ F:
 ; CHECK: encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
 }
 
+; rdar://11866926
+define i32 @test7(i64 %res) nounwind {
+entry:
+; CHECK: test7:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+  %lnot = icmp ult i64 %res, 4294967296   ; 4294967296 = 2^32, so this is true iff (%res >> 32) == 0
+  %lnot.ext = zext i1 %lnot to i32        ; widen the i1 to the i32 return value
+  ret i32 %lnot.ext
+}
+
+define i32 @test8(i64 %res) nounwind {
+entry:
+; CHECK: test8:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: cmpq $3, %rdi
+  %lnot = icmp ult i64 %res, 12884901888  ; 12884901888 = 3 * 2^32, so this is true iff (%res >> 32) <u 3
+  %lnot.ext = zext i1 %lnot to i32        ; widen the i1 to the i32 return value
+  ret i32 %lnot.ext
+}
+
+define i32 @test9(i64 %res) nounwind {
+entry:
+; CHECK: test9:
+; CHECK-NOT: movabsq
+; CHECK: shrq $33, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+  %lnot = icmp ult i64 %res, 8589934592   ; 8589934592 = 2^33, so this is true iff (%res >> 33) == 0
+  %lnot.ext = zext i1 %lnot to i32        ; widen the i1 to the i32 return value
+  ret i32 %lnot.ext
+}
+
+define i32 @test10(i64 %res) nounwind {
+entry:
+; CHECK: test10:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: setne
+  %lnot = icmp uge i64 %res, 4294967296   ; 4294967296 = 2^32, so this is true iff (%res >> 32) != 0
+  %lnot.ext = zext i1 %lnot to i32        ; widen the i1 to the i32 return value
+  ret i32 %lnot.ext
+}
+
+; rdar://9758774
+define i32 @test11(i64 %l) nounwind {
+entry:
+; CHECK: test11:
+; CHECK-NOT: movabsq
+; CHECK-NOT: andq
+; CHECK: shrq $47, %rdi
+; CHECK: cmpq $1, %rdi
+  %shr.mask = and i64 %l, -140737488355328        ; mask = -(2^47): clears the low 47 bits of %l
+  %cmp = icmp eq i64 %shr.mask, 140737488355328   ; 140737488355328 = 2^47, so this is true iff (%l >> 47) == 1
+  %conv = zext i1 %cmp to i32                     ; widen the i1 to the i32 return value
+  ret i32 %conv
+}
diff --git a/test/CodeGen/X86/coalesce-esp.ll b/test/CodeGen/X86/coalesce-esp.ll
index a5848763c98d5..4004379938790 100644
--- a/test/CodeGen/X86/coalesce-esp.ll
+++ b/test/CodeGen/X86/coalesce-esp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl	%esp, %ebp}
+; RUN: llc < %s | grep "movl	%esp, %ebp"
 ; PR4572
 
 ; Don't coalesce with %esp if it would end up putting %esp in
diff --git a/test/CodeGen/X86/coalescer-commute2.ll b/test/CodeGen/X86/coalescer-commute2.ll
index 6e5c1cf63006e..e45437cc9484e 100644
--- a/test/CodeGen/X86/coalescer-commute2.ll
+++ b/test/CodeGen/X86/coalescer-commute2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
 ; CHECK-NOT:     mov
 ; CHECK:     paddw
 ; CHECK-NOT:     mov
@@ -26,14 +26,3 @@ entry:
 	%tmp10 = bitcast <8 x i16> %tmp9 to <2 x i64>		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %tmp10
 }
-
-
-; The coalescer should commute the add to avoid a copy.
-define <4 x float> @test3(<4 x float> %V) {
-entry:
-        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
-                                        <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
-        %add = fadd <4 x float> %tmp8, %V
-        ret <4 x float> %add
-}
-
diff --git a/test/CodeGen/X86/coalescer-dce2.ll b/test/CodeGen/X86/coalescer-dce2.ll
new file mode 100644
index 0000000000000..bbbf09b267b9e
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-dce2.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -verify-coalescing
+; PR12911
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+@h = common global i32 0, align 4
+@f = common global i32 0, align 4
+@g = common global i32 0, align 4
+@a = common global i16 0, align 2
+@e = common global i32 0, align 4
+
+define void @fn1() nounwind uwtable ssp {
+entry:
+  %0 = load i32* @d, align 4
+  %tobool72 = icmp eq i32 %0, 0
+  br i1 %tobool72, label %for.end32, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %1 = load i32* @c, align 4
+  %tobool2 = icmp eq i32 %1, 0
+  %2 = load i32* @b, align 4
+  %cmp = icmp sgt i32 %2, 0
+  %conv = zext i1 %cmp to i32
+  %3 = load i32* @g, align 4
+  %tobool4 = icmp eq i32 %3, 0
+  %4 = load i16* @a, align 2
+  %tobool9 = icmp eq i16 %4, 0
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond25.loopexit.us-lcssa.us-lcssa, %if.end.us50, %if.end.us, %if.end.us.us, %for.cond1.preheader.lr.ph
+  %j.073 = phi i32 [ undef, %for.cond1.preheader.lr.ph ], [ %j.1.us.us, %if.end.us.us ], [ %j.1.us, %if.end.us ], [ %j.073, %for.cond25.loopexit.us-lcssa.us-lcssa ], [ %j.1.us36, %if.end.us50 ]
+  br i1 %tobool2, label %for.cond1.preheader.split.us, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+
+for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader
+  br i1 %tobool9, label %if.end.us50, label %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+
+for.cond1.preheader.split.us:                     ; preds = %for.cond1.preheader
+  br i1 %tobool9, label %cond.end.us.us, label %cond.end.us
+
+cond.false18.us.us:                               ; preds = %if.end.us.us
+  %5 = load i32* @f, align 4
+  %sext76 = shl i32 %5, 16
+  %phitmp75 = ashr exact i32 %sext76, 16
+  br label %cond.end.us.us
+
+if.end.us.us:                                     ; preds = %cond.end.us.us, %if.then.us.us
+  br i1 %tobool4, label %cond.false18.us.us, label %for.cond1.preheader
+
+if.then.us.us:                                    ; preds = %cond.end.us.us
+  store i32 0, i32* @f, align 4
+  br label %if.end.us.us
+
+cond.end.us.us:                                   ; preds = %cond.false18.us.us, %for.cond1.preheader.split.us
+  %j.1.us.us = phi i32 [ %j.073, %for.cond1.preheader.split.us ], [ %phitmp75, %cond.false18.us.us ]
+  store i32 %conv, i32* @h, align 4
+  br i1 %cmp, label %if.then.us.us, label %if.end.us.us
+
+cond.end21.us:                                    ; preds = %land.lhs.true12.us, %cond.false18.us
+  %cond22.us = phi i16 [ %add.us, %cond.false18.us ], [ %4, %land.lhs.true12.us ]
+  %conv24.us = sext i16 %cond22.us to i32
+  br label %cond.end.us
+
+cond.false18.us:                                  ; preds = %if.end6.us, %land.lhs.true12.us
+  %add.us = add i16 %4, %conv7.us
+  br label %cond.end21.us
+
+land.lhs.true12.us:                               ; preds = %if.end6.us
+  %conv10.us = sext i16 %conv7.us to i32
+  %sub.us = sub nsw i32 0, %conv10.us
+  %cmp14.us = icmp slt i32 %sub.us, 1
+  br i1 %cmp14.us, label %cond.end21.us, label %cond.false18.us
+
+if.end6.us:                                       ; preds = %if.end.us
+  %6 = load i32* @f, align 4
+  %conv7.us = trunc i32 %6 to i16
+  %tobool11.us = icmp eq i16 %conv7.us, 0
+  br i1 %tobool11.us, label %cond.false18.us, label %land.lhs.true12.us
+
+if.end.us:                                        ; preds = %cond.end.us, %if.then.us
+  br i1 %tobool4, label %if.end6.us, label %for.cond1.preheader
+
+if.then.us:                                       ; preds = %cond.end.us
+  store i32 0, i32* @f, align 4
+  br label %if.end.us
+
+cond.end.us:                                      ; preds = %cond.end21.us, %for.cond1.preheader.split.us
+  %j.1.us = phi i32 [ %conv24.us, %cond.end21.us ], [ %j.073, %for.cond1.preheader.split.us ]
+  store i32 %conv, i32* @h, align 4
+  br i1 %cmp, label %if.then.us, label %if.end.us
+
+for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge: ; preds = %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+  br i1 %tobool4, label %if.end6.us65, label %for.cond25.loopexit.us-lcssa.us-lcssa
+
+cond.false18.us40:                                ; preds = %if.end.us50
+  %7 = load i32* @f, align 4
+  %sext = shl i32 %7, 16
+  %phitmp = ashr exact i32 %sext, 16
+  br label %if.end.us50
+
+if.end.us50:                                      ; preds = %cond.false18.us40, %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+  %j.1.us36 = phi i32 [ %j.073, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %phitmp, %cond.false18.us40 ]
+  store i32 0, i32* @h, align 4
+  br i1 %tobool4, label %cond.false18.us40, label %for.cond1.preheader
+
+if.end6.us65:                                     ; preds = %if.end6.us65, %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+  store i32 0, i32* @h, align 4
+  br label %if.end6.us65
+
+for.cond25.loopexit.us-lcssa.us-lcssa:            ; preds = %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+  store i32 0, i32* @h, align 4
+  br label %for.cond1.preheader
+
+for.end32:                                        ; preds = %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/coalescer-identity.ll b/test/CodeGen/X86/coalescer-identity.ll
new file mode 100644
index 0000000000000..9c72ee6296bd3
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-identity.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -verify-coalescing
+; PR12927
+target triple = "x86_64-apple-macosx10.8.0"
+
+; This is a case where removeCopyByCommutingDef() creates an identity copy that
+; joinCopy must then deal with correctly.
+
+@s = common global i16 0, align 2
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
+@g0 = common global i32 0, align 4
+
+define void @func() nounwind uwtable ssp {
+for.body.lr.ph:
+  %0 = load i32* @g2, align 4, !tbaa !0
+  %tobool6 = icmp eq i32 %0, 0
+  %s.promoted = load i16* @s, align 2
+  %.pre = load i32* @g1, align 4, !tbaa !0
+  br i1 %tobool6, label %for.body.us, label %for.body
+
+for.body.us:                                      ; preds = %for.body.lr.ph, %for.inc.us
+  %1 = phi i32 [ %3, %for.inc.us ], [ %.pre, %for.body.lr.ph ]
+  %dec13.us = phi i16 [ %dec12.us, %for.inc.us ], [ %s.promoted, %for.body.lr.ph ]
+  %i.011.us = phi i32 [ %inc.us, %for.inc.us ], [ undef, %for.body.lr.ph ]
+  %v.010.us = phi i32 [ %phitmp.us, %for.inc.us ], [ 1, %for.body.lr.ph ]
+  %tobool1.us = icmp ne i32 %v.010.us, 0
+  %2 = zext i1 %tobool1.us to i16
+  %lnot.ext.us = xor i16 %2, 1
+  %add.us = add i16 %dec13.us, %lnot.ext.us
+  %conv3.us = zext i16 %add.us to i32
+  %add4.us = sub i32 0, %1
+  %tobool5.us = icmp eq i32 %conv3.us, %add4.us
+  br i1 %tobool5.us, label %for.inc.us, label %if.then7.us
+
+for.inc.us:                                       ; preds = %cond.end.us, %for.body.us
+  %3 = phi i32 [ %1, %for.body.us ], [ %4, %cond.end.us ]
+  %dec12.us = phi i16 [ %add.us, %for.body.us ], [ %dec.us, %cond.end.us ]
+  %inc.us = add i32 %i.011.us, 1
+  %phitmp.us = udiv i32 %v.010.us, 12
+  %tobool.us = icmp eq i32 %inc.us, 0
+  br i1 %tobool.us, label %for.end, label %for.body.us
+
+cond.end.us:                                      ; preds = %if.then7.us, %cond.false.us
+  %4 = phi i32 [ 0, %cond.false.us ], [ %1, %if.then7.us ]
+  %cond.us = phi i32 [ 0, %cond.false.us ], [ %v.010.us, %if.then7.us ]
+  store i32 %cond.us, i32* @g0, align 4, !tbaa !0
+  br label %for.inc.us
+
+cond.false.us:                                    ; preds = %if.then7.us
+  store i32 0, i32* @g1, align 4, !tbaa !0
+  br label %cond.end.us
+
+if.then7.us:                                      ; preds = %for.body.us
+  %dec.us = add i16 %add.us, -1
+  br i1 %tobool1.us, label %cond.end.us, label %cond.false.us
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %dec13 = phi i16 [ %dec12, %for.body ], [ %s.promoted, %for.body.lr.ph ]
+  %i.011 = phi i32 [ %inc, %for.body ], [ undef, %for.body.lr.ph ]
+  %v.010 = phi i32 [ %phitmp, %for.body ], [ 1, %for.body.lr.ph ]
+  %tobool1 = icmp eq i32 %v.010, 0
+  %lnot.ext = zext i1 %tobool1 to i16
+  %add = add i16 %dec13, %lnot.ext
+  %conv3 = zext i16 %add to i32
+  %add4 = sub i32 0, %.pre
+  %not.tobool5 = icmp ne i32 %conv3, %add4
+  %dec = sext i1 %not.tobool5 to i16
+  %dec12 = add i16 %add, %dec
+  %inc = add i32 %i.011, 1
+  %phitmp = udiv i32 %v.010, 12
+  %tobool = icmp eq i32 %inc, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc.us, %for.body
+  %dec12.lcssa = phi i16 [ %dec12.us, %for.inc.us ], [ %dec12, %for.body ]
+  store i16 %dec12.lcssa, i16* @s, align 2
+  ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index f979945835ffc..26318dd6c5585 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
 
 ; llc should share constant pool entries between this integer vector
 ; and this floating-point vector since they have the same encoding.
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
new file mode 100644
index 0000000000000..b57889643e0dc
--- /dev/null
+++ b/test/CodeGen/X86/constructor.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+
+define void @f() {
+entry:
+  ret void
+}
+
+define void @g() {
+entry:
+  ret void
+}
+
+; CTOR:		.section	.ctors.65520,"aw",@progbits
+; CTOR-NEXT:	.align	8
+; CTOR-NEXT:	.quad	g
+; CTOR-NEXT:	.section	.ctors,"aw",@progbits
+; CTOR-NEXT:	.align	8
+; CTOR-NEXT:	.quad	f
+
+; INIT-ARRAY:		.section	.init_array.15,"aw",@init_array
+; INIT-ARRAY-NEXT:	.align	8
+; INIT-ARRAY-NEXT:	.quad	g
+; INIT-ARRAY-NEXT:	.section	.init_array,"aw",@init_array
+; INIT-ARRAY-NEXT:	.align	8
+; INIT-ARRAY-NEXT:	.quad	f
diff --git a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
index b82348b32e43c..064ee364d14e1 100644
--- a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
+++ b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
-; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
 ; STATS: 9 asm-printer
 
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index cf6e27d15972a..9badfc82e99cb 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 %s -o -
-; RUN: llc -march=x86-64 %s -o -
+; RUN: llc -march=x86 < %s -verify-machineinstrs
+; RUN: llc -march=x86-64 < %s -verify-machineinstrs
 
 ; PR6497
 
@@ -391,3 +391,54 @@ if.end:
   %t11 = tail call i64 asm sideeffect "foo", "=*m,=A,{bx},{cx},1,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %t6, i32 0, i32 0, i64 0) nounwind
   ret void
 }
+
+; Avoid emitting wrong kill flags from InstrEmitter.
+; InstrEmitter::EmitSubregNode() may steal virtual registers from already
+; emitted blocks when isCoalescableExtInstr points out the opportunity.
+; Make sure kill flags are cleared on the newly global virtual register.
+define i64 @ov_read(i8* %vf, i8* nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, i32* %bitstream) nounwind uwtable ssp {
+entry:
+  br i1 undef, label %return, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br i1 undef, label %if.then3, label %if.end7
+
+if.then3:                                         ; preds = %while.body.preheader
+  %0 = load i32* undef, align 4
+  br i1 undef, label %land.lhs.true.i255, label %if.end7
+
+land.lhs.true.i255:                               ; preds = %if.then3
+  br i1 undef, label %if.then.i256, label %if.end7
+
+if.then.i256:                                     ; preds = %land.lhs.true.i255
+  %sub.i = sub i32 0, %0
+  %conv = sext i32 %sub.i to i64
+  br i1 undef, label %if.end7, label %while.end
+
+if.end7:                                          ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader
+  unreachable
+
+while.end:                                        ; preds = %if.then.i256
+  %cmp18 = icmp sgt i32 %sub.i, 0
+  %.conv = select i1 %cmp18, i64 -131, i64 %conv
+  ret i64 %.conv
+
+return:                                           ; preds = %entry
+  ret i64 -131
+}
+
+; The tail call to a varargs function sets %AL.
+; uitofp expands to an FCMOV instruction which splits the basic block.
+; Make sure the live range of %AL isn't split.
+@.str = private unnamed_addr constant { [1 x i8], [63 x i8] } zeroinitializer, align 32
+define void @pr13188(i64* nocapture %this) uwtable ssp address_safety align 2 {
+entry:
+  %x7 = load i64* %this, align 8
+  %sub = add i64 %x7, -1
+  %conv = uitofp i64 %sub to float
+  %div = fmul float %conv, 5.000000e-01
+  %conv2 = fpext float %div to double
+  tail call void (...)* @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
+  ret void
+}
+declare void @_Z6PrintFz(...)
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 6406cc73e4128..0a3dfca228c12 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index c3c7990d19ebb..af69531246cf2 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats |& grep asm-printer | grep 14
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14
 
 define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind  {
 entry:
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
index c35935f015aca..d1e349f79d6fa 100644
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ b/test/CodeGen/X86/dbg-merge-loc-entry.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-darwin8"
 ;CHECK-NEXT:    .short  Lset
 ;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.byte	85                      ## DW_OP_reg5
-;CHECK-NEXT: Ltmp5
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
 
diff --git a/test/CodeGen/X86/dbg-value-range.ll b/test/CodeGen/X86/dbg-value-range.ll
index 28d873bfba6fa..6b16865ba9eeb 100644
--- a/test/CodeGen/X86/dbg-value-range.ll
+++ b/test/CodeGen/X86/dbg-value-range.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin10 < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic -join-physregs < %s | FileCheck %s
 
 %struct.a = type { i32 }
 
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index e577ecb85aa83..8e7c13d8efa9c 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -71,3 +71,24 @@ define i32 @test7(i32 %x) nounwind {
 ; CHECK-NOT: shrl
 ; CHECK: ret
 }
+
+; PR13326
+define i8 @test8(i8 %x) nounwind {
+  %div = udiv i8 %x, 78                   ; unsigned 8-bit divide by an even constant (78 = 2 * 39)
+  ret i8 %div
+; CHECK: test8:
+; CHECK: shrb %
+; CHECK: imull $211
+; CHECK: shrl $13
+; CHECK: ret
+}
+
+define i8 @test9(i8 %x) nounwind {
+  %div = udiv i8 %x, 116                  ; unsigned 8-bit divide by a multiple of four (116 = 4 * 29)
+  ret i8 %div
+; CHECK: test9:
+; CHECK: shrb $2
+; CHECK: imull $71
+; CHECK: shrl $11
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
new file mode 100644
index 0000000000000..c5e47facf3468
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -0,0 +1,237 @@
+; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; rdar://11496434
+
+; t1: no VLAs and no dynamic realignment (the only local is 4-byte aligned).
+define i32 @t1() nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  call void @t1_helper(i32* %a) nounwind  ; the helper receives the address of %a
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Expected: %rsp is never masked, and the slot is reloaded from the same %rsp offset that was passed.
+; CHECK: _t1
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: leaq [[OFFSET:[0-9]*]](%rsp), %rdi
+; CHECK: callq _t1_helper
+; CHECK: movl [[OFFSET]](%rsp), %eax
+; CHECK: addl $13, %eax
+}
+
+declare void @t1_helper(i32*)
+
+; t2: dynamic realignment only (one 32-byte aligned local, no VLA).
+define i32 @t2() nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %v = alloca <8 x float>, align 32  ; the 32-byte aligned local
+  call void @t2_helper(i32* %a, <8 x float>* %v) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Prologue: frame pointer set up, then %rsp is masked with -32 before the frame is allocated.
+; CHECK: _t2
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; Both locals are addressed relative to %rsp when passed to the helper.
+; CHECK: leaq {{[0-9]*}}(%rsp), %rdi
+; CHECK: leaq {{[0-9]*}}(%rsp), %rsi
+; CHECK: callq _t2_helper
+; Epilogue: %rsp is restored from %rbp.
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t2_helper(i32*, <8 x float>*)
+
+; t3: a VLA only (no 32-byte aligned local, so %rsp must not be masked).
+define i32 @t3(i64 %sz) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %vla = alloca i32, i64 %sz, align 16  ; variable-sized allocation, %sz elements
+  call void @t3_helper(i32* %a, i32* %vla) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Prologue: %rbp and %rbx are pushed; no 'andq' on %rsp may appear before the 'subq'.
+; CHECK: _t3
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; Epilogue: %rsp is recomputed from %rbp before the pops.
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare void @t3_helper(i32*, i32*)
+
+; t4: VLAs + dynamic realignment (a VLA and a 32-byte aligned local in one frame).
+define i32 @t4(i64 %sz) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %v = alloca <8 x float>, align 32  ; forces the 32-byte realignment
+  %vla = alloca i32, i64 %sz, align 16  ; variable-sized allocation, %sz elements
+  call void @t4_helper(i32* %a, i32* %vla, <8 x float>* %v) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Prologue: after masking and allocating, the resulting %rsp is copied into %rbx.
+; CHECK: _t4
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %r14
+; CHECK: pushq %rbx
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; CHECK: movq %rsp, %rbx
+; The fixed-size locals (%a in %rdi, %v in %rdx) are addressed relative to %rbx.
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdi
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
+; CHECK: callq   _t4_helper
+; Epilogue: %rsp is recomputed from %rbp (two 8-byte pushes below it) before the pops.
+; CHECK: leaq -16(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+declare void @t4_helper(i32*, i32*, <8 x float>*)
+
+; t5: dynamic realignment + spill (a 256-bit value stays live across a call).
+define i32 @t5(float* nocapture %f) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %0 = bitcast float* %f to <8 x float>*
+  %1 = load <8 x float>* %0, align 32  ; loaded before the first call, used after it
+  call void @t5_helper1(i32* %a) nounwind
+  call void @t5_helper2(<8 x float> %1) nounwind
+  %2 = load i32* %a, align 4
+  %add = add nsw i32 %2, 13
+  ret i32 %add
+; Prologue: %rsp is masked with -32 even though no alloca asks for more than 4-byte alignment.
+; CHECK: _t5
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; The vector is stored to (%rsp) with an aligned move before the first call and reloaded into %ymm0 after it.
+; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]]
+; CHECK: vmovaps [[AVXREG]], (%rsp)
+; CHECK: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK: callq   _t5_helper1
+; CHECK: vmovaps (%rsp), %ymm0
+; CHECK: callq   _t5_helper2
+; CHECK: movl {{[0-9]+}}(%rsp), %eax
+; Epilogue: %rsp is restored from %rbp.
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t5_helper1(i32*)
+
+declare void @t5_helper2(<8 x float>)
+
+; t6: VLAs + dynamic realignment + spill. Only the function label is verified for now.
+; FIXME: RA has already reserved RBX, so we can't do dynamic realignment.
+define i32 @t6(i64 %sz, float* nocapture %f) nounwind uwtable ssp {
+entry:
+; CHECK: _t6
+  %a = alloca i32, align 4
+  %0 = bitcast float* %f to <8 x float>*
+  %1 = load <8 x float>* %0, align 32  ; loaded before the first call, used after it
+  %vla = alloca i32, i64 %sz, align 16  ; variable-sized allocation, %sz elements
+  call void @t6_helper1(i32* %a, i32* %vla) nounwind
+  call void @t6_helper2(<8 x float> %1) nounwind
+  %2 = load i32* %a, align 4
+  %add = add nsw i32 %2, 13
+  ret i32 %add
+}
+
+declare void @t6_helper1(i32*, i32*)
+
+declare void @t6_helper2(<8 x float>)
+
+; t7: VLAs + dynamic realignment + byval
+; The byval argument adjusts the sp after the prolog, but when the sp is
+; restored through the base pointer, the original adjustment is used.
+%struct.struct_t = type { [5 x i32] }
+
+define void @t7(i32 %size, %struct.struct_t* byval align 8 %arg1) nounwind uwtable {
+entry:
+  %x = alloca i32, align 32  ; 32-byte aligned local
+  store i32 0, i32* %x, align 32
+  %0 = zext i32 %size to i64
+  %vla = alloca i32, i64 %0, align 16  ; variable-sized allocation, %size elements
+  %1 = load i32* %x, align 32
+  call void @bar(i32 %1, i32* %vla, %struct.struct_t* byval align 8 %arg1)
+  ret void
+; Prologue: %rsp is masked with -32, the frame is allocated, and the result is copied into %rbx.
+; CHECK: _t7
+; CHECK:     pushq %rbp
+; CHECK:     movq %rsp, %rbp
+; CHECK:     pushq %rbx
+; CHECK:     andq $-32, %rsp
+; CHECK:     subq ${{[0-9]+}}, %rsp
+; CHECK:     movq %rsp, %rbx
+; After the call no 'addq' on %rsp may appear; %rsp is recomputed from %rbp instead.
+; Stack adjustment for byval
+; CHECK:     subq {{.*}}, %rsp
+; CHECK:     callq _bar
+; CHECK-NOT: addq {{.*}}, %rsp
+; CHECK:     leaq -8(%rbp), %rsp
+; CHECK:     popq %rbx
+; CHECK:     popq %rbp
+}
+
+declare i8* @llvm.stacksave() nounwind
+
+declare void @bar(i32, i32*, %struct.struct_t* byval align 8)
+
+declare void @llvm.stackrestore(i8*) nounwind
+
+
+; t8: test when forcing stack alignment; verified only by the second RUN line (-force-align-stack -stack-alignment=32).
+define i32 @t8() nounwind uwtable {
+entry:
+  %a = alloca i32, align 4  ; only local; it asks for just 4-byte alignment
+  call void @t1_helper(i32* %a) nounwind  ; reuses the helper declared for t1
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Expected: %rsp is masked with -32 and the 'subq $32' follows immediately.
+; FORCE-ALIGN: _t8
+; FORCE-ALIGN:      movq %rsp, %rbp
+; FORCE-ALIGN:      andq $-32, %rsp
+; FORCE-ALIGN-NEXT: subq $32, %rsp
+; FORCE-ALIGN:      movq %rbp, %rsp
+; FORCE-ALIGN:      popq %rbp
+}
+
+; t9: a VLA with forced stack alignment; verified only by the second RUN line.
+define i32 @t9(i64 %sz) nounwind uwtable {
+entry:
+  %a = alloca i32, align 4
+  %vla = alloca i32, i64 %sz, align 16  ; variable-sized allocation, %sz elements
+  call void @t3_helper(i32* %a, i32* %vla) nounwind  ; reuses the helper declared for t3
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+; Prologue: %rsp is masked with -32, the frame is allocated, and the result is copied into %rbx.
+; FORCE-ALIGN: _t9
+; FORCE-ALIGN: pushq %rbp
+; FORCE-ALIGN: movq %rsp, %rbp
+; FORCE-ALIGN: pushq %rbx
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN: subq $32, %rsp
+; FORCE-ALIGN: movq %rsp, %rbx
+; Epilogue: %rsp is recomputed from %rbp (one 8-byte push below it) before the pops.
+; FORCE-ALIGN: leaq -8(%rbp), %rsp
+; FORCE-ALIGN: popq %rbx
+; FORCE-ALIGN: popq %rbp
+}
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
new file mode 100644
index 0000000000000..7883ffabd5659
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge, although the IR loop below contains one.
+; CHECK-NOT: LBB
+do.body:
+  %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+  %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+  %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+  %0 = load i32* %p.addr.0, align 4
+  %cmp = icmp sgt i32 %0, %max.0
+  br i1 %cmp, label %do.cond, label %if.else  ; the in-loop conditional branch that must not survive
+
+if.else:
+  %cmp1 = icmp slt i32 %0, %min.0
+  %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+  br label %do.cond
+
+do.cond:
+  %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+  %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: decl %esi
+; CHECK: jne LBB
+  %dec = add i32 %n.addr.0, -1  ; loop counter decrement
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body  ; back edge; the only branch expected in the output loop
+
+do.end:
+  %sub = sub nsw i32 %max.1, %min.1
+  ret i32 %sub
+}
+
+; CHECK: multipreds
+; Deal with alternative tail predecessors: %if.end41 is entered from %entry, %if.then29 and %if.then37.
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: fprintf
+
+define void @multipreds(i32 %sw) nounwind uwtable ssp {
+entry:
+  switch i32 %sw, label %if.then29 [
+    i32 0, label %if.then37
+    i32 127, label %if.end41
+  ]
+
+if.then29:
+  br label %if.end41
+
+if.then37:
+  br label %if.end41
+
+if.end41:
+  %exit_status.0 = phi i32 [ 2, %if.then29 ], [ 0, %if.then37 ], [ 66, %entry ]  ; three-way value the directives expect to be chosen with two cmovs
+  call void (...)* @fprintf(i32 %exit_status.0) nounwind
+  unreachable
+}
+
+declare void @fprintf(...) nounwind
diff --git a/test/CodeGen/X86/epilogue.ll b/test/CodeGen/X86/epilogue.ll
index 0f16a64ccd798..090680e48febb 100644
--- a/test/CodeGen/X86/epilogue.ll
+++ b/test/CodeGen/X86/epilogue.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | not grep lea
-; RUN: llc < %s -mcpu=generic -march=x86 | grep {movl	%ebp}
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+
+; CHECK-NOT: lea{{.*}}(%esp)
+; CHECK: {{(mov.* %ebp, %esp)|(lea.*\(%ebp\), %esp)}}
 
 declare void @bar(<2 x i64>* %n)
 
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index 14778f097ef53..9e1a3754d0f07 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 -mcpu=penryn > %t
 ; RUN: not grep movd %t
-; RUN: grep {movss	%xmm} %t | count 1
-; RUN: grep {extractps	\\\$1, %xmm0, } %t | count 1
+; RUN: grep "movss	%xmm" %t | count 1
+; RUN: grep "extractps	\$1, %xmm0, " %t | count 1
 ; PR2647
 
 external global float, align 16         ; <float*>:0 [#uses=2]
diff --git a/test/CodeGen/X86/fabs.ll b/test/CodeGen/X86/fabs.ll
index 9ded7e05dc465..af1867fc51ccc 100644
--- a/test/CodeGen/X86/fabs.ll
+++ b/test/CodeGen/X86/fabs.ll
@@ -1,28 +1,54 @@
 ; Make sure this testcase codegens to the fabs instruction, not a call to fabsf
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3,-sse | grep fabs\$ | \
-; RUN:   count 2
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | \
-; RUN:   grep fabs\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse2,-sse3,-sse | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=UNSAFE
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -O0 | FileCheck %s --check-prefix=NOOPT
 
 declare float @fabsf(float)
 
 declare x86_fp80 @fabsl(x86_fp80)
 
+; CHECK:  test1:
+; UNSAFE: test1:
+; NOOPT:  test1:
 define float @test1(float %X) {
-        %Y = call float @fabsf(float %X)
+        %Y = call float @fabsf(float %X) readnone
         ret float %Y
 }
+; CHECK:  {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
 
+; CHECK-NOT:  fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT:  fabsf
+
+; CHECK:  test2:
+; UNSAFE: test2:
+; NOOPT:  test2:
 define double @test2(double %X) {
         %Y = fcmp oge double %X, -0.0
         %Z = fsub double -0.0, %X
         %Q = select i1 %Y, double %X, double %Z
         ret double %Q
 }
+; fabs is not used here.
+; CHECK-NOT:  fabs
+; NOOPT-NOT:  fabs
+
+; UNSAFE: {{^[ \t]+fabs$}}
 
+; UNSAFE-NOT: fabs
+
+; CHECK:  test3:
+; UNSAFE: test3:
+; NOOPT:  test3:
 define x86_fp80 @test3(x86_fp80 %X) {
-        %Y = call x86_fp80 @fabsl(x86_fp80 %X)
+        %Y = call x86_fp80 @fabsl(x86_fp80 %X) readnone
         ret x86_fp80 %Y
 }
+; CHECK:  {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
+; NOOPT:  {{^[ \t]+fabs$}}
 
-
+; CHECK-NOT:  fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT:  fabs
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index e4982f0549546..14cb136f89de6 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mcpu=generic -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep {add	ESP, 8}
+; RUN:   grep "add	ESP, 8"
 
 target triple = "i686-pc-linux-gnu"
 
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index 323c8533cec2f..b3adb802a8c57 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {LCPI0_0(%rip)}
+; RUN: llc < %s -fast-isel | grep "LCPI0_0(%rip)"
 ; Make sure fast isel uses rip-relative addressing when required.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin9.0"
diff --git a/test/CodeGen/X86/fast-isel-gv.ll b/test/CodeGen/X86/fast-isel-gv.ll
index 34f8b382522fc..cb2464e746b17 100644
--- a/test/CodeGen/X86/fast-isel-gv.ll
+++ b/test/CodeGen/X86/fast-isel-gv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {_kill@GOTPCREL(%rip)}
+; RUN: llc < %s -fast-isel | grep "_kill@GOTPCREL(%rip)"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin10.0"
 @f = global i8 (...)* @kill		; <i8 (...)**> [#uses=1]
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 8db1936bc20ea..52b1e8564338e 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=generic | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=atom | FileCheck -check-prefix=ATOM %s
 
 @src = external global i32
 
@@ -18,6 +19,13 @@ entry:
 ; CHECK: 	movl	%eax, (%ecx)
 ; CHECK: 	ret
 
+; ATOM:	loadgv:
+; ATOM:		movl    L_src$non_lazy_ptr, %ecx
+; ATOM:         movl    (%ecx), %eax
+; ATOM:         addl    (%ecx), %eax
+; ATOM:         movl    %eax, (%ecx)
+; ATOM:         ret
+
 }
 
 %stuff = type { i32 (...)** }
@@ -31,4 +39,8 @@ entry:
 ; CHECK:	movl	$0, %eax
 ; CHECK:	movl	L_LotsStuff$non_lazy_ptr, %ecx
 
+; ATOM: _t:
+; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %ecx
+; ATOM:         movl    $0, %eax
+
 }
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index b9598bb465ce4..19f38882a6c68 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -46,3 +46,17 @@ entry:
 ; CHECK: addl $40
 }
 declare void @test3sret(%struct.a* sret)
+
+; Check that fast-isel sret works with fastcc (and does not callee-pop): the caller must undo its own 28-byte adjustment.
+define void @test4() nounwind ssp {
+entry:
+  %tmp = alloca %struct.a, align 8  ; storage for the sret result
+  call fastcc void @test4fastccsret(%struct.a* sret %tmp)
+  ret void
+; CHECK: test4:
+; CHECK: subl $28
+; CHECK: leal (%esp), %ecx
+; CHECK: calll _test4fastccsret
+; CHECK: addl $28
+}
+declare fastcc void @test4fastccsret(%struct.a* sret)
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index c88d52968dd8d..132df2b0ab43d 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10
 
 ; This tests very minimal fast-isel functionality.
 
@@ -117,3 +117,11 @@ define i64* @life() nounwind {
   ret i64* %a3
 }
 
+declare void @llvm.donothing() readnone
+
+; CHECK: donada
+define void @donada() nounwind {  ; NOTE(review): no visible RUN line pipes into FileCheck, so the directive above looks inert -- confirm
+entry:
+  call void @llvm.donothing()  ; presumably must be selected by fast-isel without fallback (RUN lines pass -fast-isel-abort)
+  ret void
+}
diff --git a/test/CodeGen/X86/fastcc-byval.ll b/test/CodeGen/X86/fastcc-byval.ll
index 52b3e57b96bc5..f1204d677a55b 100644
--- a/test/CodeGen/X86/fastcc-byval.ll
+++ b/test/CodeGen/X86/fastcc-byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt=false | grep {movl\[\[:space:\]\]*8(%esp), %eax} | count 2
+; RUN: llc < %s -tailcallopt=false | grep "movl[[:space:]]*8(%esp), %eax" | count 2
 ; PR3122
 ; rdar://6400815
 
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 5deedb9dd9b17..b0c1d0a0dd1c5 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10               | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10             | FileCheck %s --check-prefix=CHECK-FMA-CALL
 
 ; CHECK: test_f32
-; CHECK: _fmaf
+; CHECK-FMA-INST: vfmadd213ss
+; CHECK-FMA-CALL: _fmaf
 
 define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
 entry:
@@ -11,7 +14,8 @@ entry:
 }
 
 ; CHECK: test_f64
-; CHECK: _fma
+; CHECK-FMA-INST: vfmadd213sd
+; CHECK-FMA-CALL: _fma
 
 define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
new file mode 100755
index 0000000000000..90529e09d75b3
--- /dev/null
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+
+define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fmadd213ss %xmm
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fmadd213ps
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {  ; 256-bit variant of test_x86_fmadd_ps
+  ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm
+  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+  ret <8 x float> %res  ; NOTE(review): the directive requires a memory operand; presumably the x86_64-pc-win32 triple passes these vectors indirectly -- confirm
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fnmadd213ss %xmm
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fnmadd213ps
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {  ; 256-bit variant of test_x86_fnmadd_ps
+  ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm
+  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+  ret <8 x float> %res  ; NOTE(review): the directive requires a memory operand; presumably the x86_64-pc-win32 triple passes these vectors indirectly -- confirm
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fmsub213ss
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fmsub213ps
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fnmsub213ss
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: fnmsub213ps
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+;;;;
+
+define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fmadd213sd
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fmadd213pd
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fnmadd213sd
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fnmadd213pd
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+
+
+define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fmsub213sd
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fmsub213pd
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fnmsub213sd
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: fnmsub213pd
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 5ed03ef01f3c1..fd414b346e2b2 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,295 +1,295 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
 
 ; VFMADD
-define < 4 x float > @test_x86_fma4_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmaddss
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
   ; CHECK: vfmaddss (%{{.*}})
   %x = load float *%a2
   %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
   ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
   %x = load float *%a1
   %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddsd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
   ; CHECK: vfmaddsd (%{{.*}})
   %x = load double *%a2
   %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
   %x = load double *%a1
   %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 4 x float > @test_x86_fma4_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmaddps
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
   ; CHECK: vfmaddps (%{{.*}})
   %x = load <4 x float>* %a2
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
   ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
   %x = load <4 x float>* %a1
-  %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
   ; CHECK: vfmaddpd (%{{.*}})
   %x = load <2 x double>* %a2
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
   %x = load <2 x double>* %a1
-  %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfmaddps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfmaddpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMSUB
-define < 4 x float > @test_x86_fma4_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmsubss
-  %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmsubsd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 4 x float > @test_x86_fma4_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmsubps
-  %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmsubpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfmsubps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfmsubpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFNMADD
-define < 4 x float > @test_x86_fma4_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfnmaddss
-  %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfnmaddsd
-  %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 4 x float > @test_x86_fma4_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfnmaddps
-  %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfnmaddpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfnmaddps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfnmaddpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFNMSUB
-define < 4 x float > @test_x86_fma4_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfnmsubss
-  %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfnmsubsd
-  %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 4 x float > @test_x86_fma4_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfnmsubps
-  %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfnmsubpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfnmsubps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfnmsubpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMADDSUB
-define < 4 x float > @test_x86_fma4_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmaddsubps
-  %res = call < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddsubpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfmaddsubps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfmaddsubpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMSUBADD
-define < 4 x float > @test_x86_fma4_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
   ; CHECK: vfmsubaddps
-  %res = call < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
   ret < 4 x float > %res
 }
-declare < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-define < 2 x double > @test_x86_fma4_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmsubaddpd
-  %res = call < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
   ret < 2 x double > %res
 }
-declare < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-define < 8 x float > @test_x86_fma4_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfmsubaddps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
   ret < 8 x float > %res
 }
-declare < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
-define < 4 x double > @test_x86_fma4_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
   ; CHECK: vfmsubaddpd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
   ret < 4 x double > %res
 }
-declare < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
new file mode 100644
index 0000000000000..5d97a87b3bbfe
--- /dev/null
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+
+; CHECK: test_x86_fmadd_ps
+; CHECK: vfmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = fmul <4 x float> %a0, %a1      ; x = a0 * a1
+  %res = fadd <4 x float> %x, %a2     ; res = x + a2; expected to be contracted into the single FMA matched above
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: vfmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = fmul <4 x float> %a0, %a1      ; x = a0 * a1
+  %res = fsub <4 x float> %x, %a2     ; res = x - a2; expected to be contracted into one fused multiply-subtract
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps
+; CHECK: vfnmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = fmul <4 x float> %a0, %a1      ; x = a0 * a1
+  %res = fsub <4 x float> %a2, %x     ; res = a2 - x, i.e. -(a0 * a1) + a2
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps
+; CHECK: vfnmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = fmul <4 x float> %a0, %a1      ; x = a0 * a1
+  %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x ; y = -0.0 - x, i.e. the negated product
+  %res = fsub <4 x float> %y, %a2     ; res = -(a0 * a1) - a2
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmadd_ps_y
+; CHECK: vfmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  %x = fmul <8 x float> %a0, %a1      ; x = a0 * a1 (8 x float variant)
+  %res = fadd <8 x float> %x, %a2     ; res = x + a2
+  ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y
+; CHECK: vfmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  %x = fmul <8 x float> %a0, %a1      ; x = a0 * a1 (8 x float variant)
+  %res = fsub <8 x float> %x, %a2     ; res = x - a2
+  ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y
+; CHECK: vfnmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  %x = fmul <8 x float> %a0, %a1      ; x = a0 * a1 (8 x float variant)
+  %res = fsub <8 x float> %a2, %x     ; res = a2 - x, i.e. -(a0 * a1) + a2
+  ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y
+; CHECK: vfnmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+  %x = fmul <8 x float> %a0, %a1      ; x = a0 * a1 (8 x float variant)
+  %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x ; y = -0.0 - x, i.e. the negated product
+  %res = fsub <8 x float> %y, %a2     ; res = -(a0 * a1) - a2
+  ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y
+; CHECK: vfmadd213pd     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+  %x = fmul <4 x double> %a0, %a1     ; x = a0 * a1 (4 x double variant)
+  %res = fadd <4 x double> %x, %a2    ; res = x + a2
+  ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y
+; CHECK: vfmsub213pd     %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+  %x = fmul <4 x double> %a0, %a1     ; x = a0 * a1 (4 x double variant)
+  %res = fsub <4 x double> %x, %a2    ; res = x - a2
+  ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd
+; CHECK: vfmsub213pd     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  %x = fmul <2 x double> %a0, %a1     ; x = a0 * a1 (2 x double variant)
+  %res = fsub <2 x double> %x, %a2    ; res = x - a2
+  ret <2 x double> %res
+}
+
+; CHECK: test_x86_fnmadd_ss
+; CHECK: vfnmadd213ss    %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
+  %x = fmul float %a0, %a1            ; x = a0 * a1 (scalar float)
+  %res = fsub float %a2, %x           ; res = a2 - x, i.e. -(a0 * a1) + a2
+  ret float %res
+}
+
+; CHECK: test_x86_fnmadd_sd
+; CHECK: vfnmadd213sd     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
+  %x = fmul double %a0, %a1           ; x = a0 * a1 (scalar double)
+  %res = fsub double %a2, %x          ; res = a2 - x, i.e. -(a0 * a1) + a2
+  ret double %res
+}
+
+; CHECK: test_x86_fmsub_sd
+; CHECK: vfmsub213sd     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
+  %x = fmul double %a0, %a1           ; x = a0 * a1 (scalar double)
+  %res = fsub double %x, %a2          ; res = x - a2
+  ret double %res
+}
+
+; CHECK: test_x86_fnmsub_ss
+; CHECK: vfnmsub213ss     %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
+  %x = fsub float -0.000000e+00, %a0  ; x = -a0 (here the negation is applied to an operand, not to the product)
+  %y = fmul float %x, %a1             ; y = (-a0) * a1
+  %res = fsub float %y, %a2           ; res = (-a0) * a1 - a2
+  ret float %res
+}
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index e03cb7edb5803..c961f7576f937 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -45,3 +45,29 @@ L:
 
 }
 
+; rdar://10554090
+; The xor in the exit block should be CSE'd with the one in entry, and a load should be folded into that xor.
+define i1 @test3(i32* %P, i32* %Q) nounwind {
+; CHECK: test3:
+; CHECK: movl 8(%esp), %eax
+; CHECK: xorl (%eax),
+; CHECK: j
+; CHECK-NOT: xor
+entry:
+  %0 = load i32* %P, align 4
+  %1 = load i32* %Q, align 4
+  %2 = xor i32 %0, %1                      ; first xor: *P ^ *Q
+  %3 = and i32 %2, 65535                   ; keep the low 16 bits (0xFFFF)
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %exit, label %land.end
+
+exit:
+  %shr.i.i19 = xor i32 %1, %0              ; same xor with the operands swapped
+  %5 = and i32 %shr.i.i19, 2147418112      ; mask 0x7FFF0000
+  %6 = icmp eq i32 %5, 0
+  br label %land.end
+
+land.end:
+  %7 = phi i1 [ %6, %exit ], [ false, %entry ] ; false when the low-16-bit test failed in entry
+  ret i1 %7
+}
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index cc4198d7caf00..d850630a4d083 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,11 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep pcmpeqd %t | count 1
-; RUN: grep xor %t | count 1
-; RUN: not grep LCP %t
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
 
 define <2 x double> @foo() nounwind {
   ret <2 x double> bitcast (<2 x i64><i64 -1, i64 -1> to <2 x double>)
+; CHECK: foo:
+; CHECK: pcmpeqd %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
 }
 define <2 x double> @bar() nounwind {
   ret <2 x double> bitcast (<2 x i64><i64 0, i64 0> to <2 x double>)
+; CHECK: bar:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
 }
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
new file mode 100644
index 0000000000000..2ada194f891ff
--- /dev/null
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -0,0 +1,70 @@
+; This test is attempting to detect when we request forced re-alignment of the
+; stack to an alignment greater than would be available due to the ABI. We
+; arbitrarily force alignment up to 32-bytes for i386 hoping that this will
+; exceed any ABI provisions.
+;
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define i32 @f(i8* %p) nounwind {
+entry:
+  %0 = load i8* %p                 ; read one byte through %p
+  %conv = sext i8 %0 to i32        ; sign-extend it to the i32 return value
+  ret i32 %conv
+}
+
+define i64 @g(i32 %i) nounwind {
+; CHECK: g:
+; CHECK:      pushl  %ebp
+; CHECK-NEXT: movl   %esp, %ebp
+; CHECK-NEXT: pushl
+; CHECK-NEXT: pushl
+; CHECK-NEXT: andl   $-32, %esp
+; CHECK-NEXT: subl   $32, %esp
+;
+; Now set up the base pointer (%esi).
+; CHECK-NEXT: movl   %esp, %esi
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+;
+; The next adjustment of the stack is due to the alloca.
+; CHECK:      movl   %{{...}}, %esp
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+;
+; Next we set up the memset call, and then undo it.
+; CHECK:      subl   $32, %esp
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+; CHECK:      calll  memset
+; CHECK-NEXT: addl   $32, %esp
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+;
+; Next we set up the call to 'f'.
+; CHECK:      subl   $32, %esp
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+; CHECK:      calll  f
+; CHECK-NEXT: addl   $32, %esp
+; CHECK-NOT:         {{[^ ,]*}}, %esp
+;
+; Restore %esp from %ebp (frame pointer) and subtract the size of
+; zone with callee-saved registers to pop them.
+; This is the state prior to stack realignment and the allocation of VLAs.
+; CHECK-NOT:  popl
+; CHECK:      leal   -8(%ebp), %esp
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl   %ebp
+; CHECK-NEXT: ret
+
+entry:
+  br label %if.then
+
+if.then:
+  %0 = alloca i8, i32 %i                   ; variable-sized alloca of %i bytes, placed outside the entry block
+  call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 %i, i32 1, i1 false) ; fill the %i-byte buffer with zero bytes
+  %call = call i32 @f(i8* %0)              ; pass the buffer to @f
+  %conv = sext i32 %call to i64            ; widen @f's result to the i64 return type
+  ret i64 %conv
+}
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) nounwind
diff --git a/test/CodeGen/X86/fp-immediate-shorten.ll b/test/CodeGen/X86/fp-immediate-shorten.ll
index cafc61a41ff29..62d81003a62d7 100644
--- a/test/CodeGen/X86/fp-immediate-shorten.ll
+++ b/test/CodeGen/X86/fp-immediate-shorten.ll
@@ -1,7 +1,7 @@
 ;; Test that this FP immediate is stored in the constant pool as a float.
 
 ; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3 | \
-; RUN:   grep {.long.1123418112}
+; RUN:   grep ".long.1123418112"
 
 define double @D() {
         ret double 1.230000e+02
diff --git a/test/CodeGen/X86/fp-in-intregs.ll b/test/CodeGen/X86/fp-in-intregs.ll
index 6966cf0497898..1f5121d271c0e 100644
--- a/test/CodeGen/X86/fp-in-intregs.ll
+++ b/test/CodeGen/X86/fp-in-intregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mcpu=yonah | FileCheck %s
 ; CHECK-NOT:     {{((xor|and)ps|movd)}}
 
 ; These operations should be done in integer registers, eliminating constant
diff --git a/test/CodeGen/X86/fp-stack-compare-cmov.ll b/test/CodeGen/X86/fp-stack-compare-cmov.ll
new file mode 100644
index 0000000000000..b457fbc1a332f
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-compare-cmov.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; PR1012
+
+define float @foo(float* %col.2.0) {
+; CHECK: fucompi
+; CHECK: fcmov
+  %tmp = load float* %col.2.0
+  %tmp16 = fcmp olt float %tmp, 0.000000e+00              ; ordered tmp < 0.0
+  %tmp20 = fsub float -0.000000e+00, %tmp                 ; -tmp
+  %iftmp.2.0 = select i1 %tmp16, float %tmp20, float %tmp ; (tmp < 0.0) ? -tmp : tmp
+  ret float %iftmp.2.0
+}
diff --git a/test/CodeGen/X86/fp-stack-compare.ll b/test/CodeGen/X86/fp-stack-compare.ll
index f3998b67f6726..a8557adeaf742 100644
--- a/test/CodeGen/X86/fp-stack-compare.ll
+++ b/test/CodeGen/X86/fp-stack-compare.ll
@@ -1,8 +1,11 @@
 ; RUN: llc < %s -march=x86 -mcpu=i386 | FileCheck %s
-; PR1012
+; PR6679
 
 define float @foo(float* %col.2.0) {
-; CHECK: fucompi
+; CHECK: fucomp
+; CHECK-NOT: fucompi
+; CHECK: j
+; CHECK-NOT: fcmov
   %tmp = load float* %col.2.0
   %tmp16 = fcmp olt float %tmp, 0.000000e+00
   %tmp20 = fsub float -0.000000e+00, %tmp
diff --git a/test/CodeGen/X86/fp-stack-ret.ll b/test/CodeGen/X86/fp-stack-ret.ll
index 1307f70ead17d..2733117a1f023 100644
--- a/test/CodeGen/X86/fp-stack-ret.ll
+++ b/test/CodeGen/X86/fp-stack-ret.ll
@@ -22,7 +22,7 @@ define fastcc double @test2(<2 x double> %A) {
 
 ; CHECK: test3
 ; CHECK: sub{{.*}}%esp
-; CHECLK-NOT: xmm
+; CHECK-NOT: xmm
 define fastcc double @test3(<4 x float> %A) {
 	%B = bitcast <4 x float> %A to <2 x double>
 	%C = call fastcc double @test2(<2 x double> %B)
diff --git a/test/CodeGen/X86/fp_load_fold.ll b/test/CodeGen/X86/fp_load_fold.ll
index 0145069b8cd63..a2cea5e57f648 100644
--- a/test/CodeGen/X86/fp_load_fold.ll
+++ b/test/CodeGen/X86/fp_load_fold.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep -i ST | not grep {fadd\\|fsub\\|fdiv\\|fmul}
+; RUN:   grep -i ST | not grep "fadd\|fsub\|fdiv\|fmul"
 
 ; Test that the load of the memory location is folded into the operation.
 
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index ff9b1b0b6a5a8..1344cdcd4320d 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,9 +1,17 @@
-; RUN: llc < %s -march=x86 >%t
-
-; RUN: grep {addl	\\\$4,} %t | count 3
-; RUN: not grep {,%} %t
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
+; ATOM: foo
+; ATOM: addl
+; ATOM: leal
+; ATOM: leal
+
+; CHECK: foo
+; CHECK: addl
+; CHECK: addl
+; CHECK: addl
+
 entry:
 	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
 	br i1 %0, label %bb, label %return
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 4a6927f6a269b..72a50961b2ff1 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
 ; rdar://7398554
 
 ; When doing vector gather-scatter index calculation with 32-bit indices,
diff --git a/test/CodeGen/X86/gs-fold.ll b/test/CodeGen/X86/gs-fold.ll
new file mode 100644
index 0000000000000..dbec76ba52c40
--- /dev/null
+++ b/test/CodeGen/X86/gs-fold.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-freebsd | FileCheck %s --check-prefix=CHECK-FBSD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK-LINUX
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct.thread = type { i32, i32, i32, i32 }
+
+define i32 @test() nounwind uwtable {
+entry:
+  %0 = load volatile %struct.thread* addrspace(256)* null         ; load a %struct.thread* from address 0 of address space 256
+  %c = getelementptr inbounds %struct.thread* %0, i64 0, i32 2    ; address of field index 2 (third i32 of the struct)
+  %1 = load i32* %c, align 4                                      ; read that field
+  ret i32 %1
+}
+
+; Check that we are not assuming that gs contains the address of gs if we are not targeting Linux
+; CHECK-FBSD: movq	%gs:0, %rax
+; CHECK-FBSD: movl	8(%rax), %eax
+; Check that we are assuming that gs contains the address of gs if we are targeting Linux
+; CHECK-LINUX: movl	%gs:8, %eax
+
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index 76ffd66524b94..968a9e88c0e97 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movzbl	%\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86 | grep "movzbl	%[abcd]h," | count 7
 
 ; Use h-register extract and zero-extend.
 
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index 98817f3fb59f5..a19fca555811a 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {movzbl	%\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86-64 | grep "movzbl	%[abcd]h," | count 7
 
 ; Use h-register extract and zero-extend.
 
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 402cdfe413b57..903c4538aba73 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-linux > %t
-; RUN: grep {movzbl	%\[abcd\]h,} %t | count 8
-; RUN: grep {%\[abcd\]h} %t | not grep {%r\[\[:digit:\]\]*d}
+; RUN: grep "movzbl	%[abcd]h," %t | count 8
+; RUN: grep "%[abcd]h" %t | not grep "%r[[:digit:]]*d"
 
 ; LLVM creates virtual registers for values live across blocks
 ; based on the type of the value. Make sure that the extracts
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 4289fa7cc254d..74ecd045b3d53 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stats -O2 |& grep "1 machine-licm"
+; RUN: llc < %s -stats -O2 2>&1 | grep "1 machine-licm"
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7.2"
diff --git a/test/CodeGen/X86/iabs.ll b/test/CodeGen/X86/iabs.ll
index a8ba0155fd10e..9196cce1ae5ac 100644
--- a/test/CodeGen/X86/iabs.ll
+++ b/test/CodeGen/X86/iabs.ll
@@ -1,13 +1,17 @@
-; RUN: llc < %s -march=x86-64 -stats  |& \
-; RUN:   grep {5 .*Number of machine instrs printed}
+; RUN: llc < %s -march=x86-64 | FileCheck %s
 
 ;; Integer absolute value, should produce something at least as good as:
-;;       movl %edi, %ecx
-;;       sarl $31, %ecx
-;;       leal (%rdi,%rcx), %eax
-;;       xorl %ecx, %eax
+;;       movl   %edi, %eax
+;;       negl   %eax
+;;       cmovll %edi, %eax
 ;;       ret
+; rdar://10695237
 define i32 @test(i32 %a) nounwind {
+; CHECK: test:
+; CHECK: mov
+; CHECK-NEXT: neg
+; CHECK-NEXT: cmov
+; CHECK-NEXT: ret
         %tmp1neg = sub i32 0, %a
         %b = icmp sgt i32 %a, -1
         %abs = select i1 %b, i32 %a, i32 %tmp1neg
diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll
index cecf77af4de12..62a21f4c5aadc 100644
--- a/test/CodeGen/X86/illegal-vector-args-return.ll
+++ b/test/CodeGen/X86/illegal-vector-args-return.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd	%xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd	%xmm2, %xmm0}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps	%xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps	%xmm2, %xmm0}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd	%xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd	%xmm2, %xmm0"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps	%xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps	%xmm2, %xmm0"
 
 define <4 x double> @foo(<4 x double> %x, <4 x double> %z) {
   %y = fmul <4 x double> %x, %z
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 134d6e9528336..747a5891cf046 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march x86 -regalloc=fast       < %s 2> %t1
+; RUN: not llc -march x86 -regalloc=fast -optimize-regalloc=0 < %s 2> %t1
 ; RUN: not llc -march x86 -regalloc=basic      < %s 2> %t2
 ; RUN: not llc -march x86 -regalloc=greedy     < %s 2> %t3
 ; RUN: FileCheck %s < %t1
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index 5e76b6c0580e9..b069c46318991 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep { 37}
+; RUN: llc < %s -march=x86 | grep " 37"
 ; rdar://7008959
 
 define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index eef6c2f377a7b..e6eb9efd8c781 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -43,3 +43,12 @@ entry:
   %0 = tail call i8 asm sideeffect "xchg $0, $1", "=r,*m,0,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %p, i1 %desired) nounwind
   ret void
 }
+
+; <rdar://problem/11542429>
+; The constrained GR32_ABCD register class of the 'q' constraint requires
+; special handling after the preceding outputs used up eax-edx.
+define void @constrain_abcd(i8* %h) nounwind ssp {
+entry:
+  %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind ; four early-clobber 'r' outputs, one early-clobber 'q' output, one 'r' input (%h); ecx is listed as clobbered
+  ret void
+}
diff --git a/test/CodeGen/X86/inreg.ll b/test/CodeGen/X86/inreg.ll
new file mode 100644
index 0000000000000..6653cfb14ed81
--- /dev/null
+++ b/test/CodeGen/X86/inreg.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 | FileCheck --check-prefix=DAG %s
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 -O0 | FileCheck --check-prefix=FAST %s
+
+%struct.s1 = type { double, float }
+
+define void @g1() nounwind {
+entry:
+  %tmp = alloca %struct.s1, align 4        ; stack slot that receives the struct result of @f
+  call void @f(%struct.s1* inreg sret %tmp, i32 inreg 41, i32 inreg 42, i32 43) ; the sret pointer, 41 and 42 are marked inreg; 43 is not
+  ret void
+  ; DAG: g1:
+  ; DAG: subl $[[AMT:.*]], %esp
+  ; DAG-NEXT: $43, (%esp)
+  ; DAG-NEXT: leal    16(%esp), %eax
+  ; DAG-NEXT: movl    $41, %edx
+  ; DAG-NEXT: movl    $42, %ecx
+  ; DAG-NEXT: calll   f
+  ; DAG-NEXT: addl $[[AMT]], %esp
+  ; DAG-NEXT: ret
+
+  ; FAST: g1:
+  ; FAST: subl $[[AMT:.*]], %esp
+  ; FAST-NEXT: leal    8(%esp), %eax
+  ; FAST-NEXT: movl    $41, %edx
+  ; FAST-NEXT: movl    $42, %ecx
+  ; FAST: $43, (%esp)
+  ; FAST: calll   f
+  ; FAST-NEXT: addl $[[AMT]], %esp
+  ; FAST: ret
+}
+
+declare void @f(%struct.s1* inreg sret, i32 inreg, i32 inreg, i32)
+
+%struct.s2 = type {}
+
+define void @g2(%struct.s2* inreg sret %agg.result) nounwind {
+entry:
+  ret void                                 ; the sret pointer is inreg, so both configurations below must not emit 'ret $4'
+  ; DAG: g2
+  ; DAG-NOT: ret $4
+  ; DAG: .size g2
+
+  ; FAST: g2
+  ; FAST-NOT: ret $4
+  ; FAST: .size g2
+}
diff --git a/test/CodeGen/X86/isel-sink2.ll b/test/CodeGen/X86/isel-sink2.ll
index 5ed0e00fd8736..b162666362aa7 100644
--- a/test/CodeGen/X86/isel-sink2.ll
+++ b/test/CodeGen/X86/isel-sink2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 > %t
-; RUN: grep {movb.7(%...)} %t
+; RUN: grep "movb.7(%...)" %t
 ; RUN: not grep leal %t
 
 define i8 @test(i32 *%P) nounwind {
diff --git a/test/CodeGen/X86/ispositive.ll b/test/CodeGen/X86/ispositive.ll
index 8adf723aabc38..b1d1a20c8eb69 100644
--- a/test/CodeGen/X86/ispositive.ll
+++ b/test/CodeGen/X86/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {shrl.*31}
+; RUN: llc < %s -march=x86 | grep "shrl.*31"
 
 define i32 @test1(i32 %X) {
 entry:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index dbd133cd9ab4f..48e21061d2099 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -22,6 +22,7 @@ declare i32 @bar(...)
 declare i32 @baz(...)
 
 ; rdar://10633221
+; rdar://11355268
 define i32 @g(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: g:
@@ -32,3 +33,223 @@ entry:
   %cond = select i1 %cmp, i32 %sub, i32 0
   ret i32 %cond
 }
+
+; rdar://10734411
+define i32 @h(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: h:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+  %cmp = icmp slt i32 %b, %a               ; signed b < a (operands swapped relative to the sub)
+  %sub = sub nsw i32 %a, %b                ; a - b
+  %cond = select i1 %cmp, i32 %sub, i32 0  ; (b < a) ? a - b : 0
+  ret i32 %cond
+}
+define i32 @i(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: i:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+  %cmp = icmp sgt i32 %a, %b               ; signed a > b (same operand order as the sub)
+  %sub = sub nsw i32 %a, %b                ; a - b
+  %cond = select i1 %cmp, i32 %sub, i32 0  ; (a > b) ? a - b : 0
+  ret i32 %cond
+}
+define i32 @j(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: j:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+  %cmp = icmp ugt i32 %a, %b               ; unsigned a > b (same operand order as the sub)
+  %sub = sub i32 %a, %b                    ; a - b
+  %cond = select i1 %cmp, i32 %sub, i32 0  ; (a > b) ? a - b : 0
+  ret i32 %cond
+}
+define i32 @k(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: k:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+  %cmp = icmp ult i32 %b, %a               ; unsigned b < a (operands swapped relative to the sub)
+  %sub = sub i32 %a, %b                    ; a - b
+  %cond = select i1 %cmp, i32 %sub, i32 0  ; (b < a) ? a - b : 0
+  ret i32 %cond
+}
+; redundant cmp instruction
+define i32 @l(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l:
+; CHECK-NOT: cmp
+  %cmp = icmp slt i32 %b, %a               ; signed b < a (operands swapped relative to the sub)
+  %sub = sub nsw i32 %a, %b                ; a - b
+  %cond = select i1 %cmp, i32 %sub, i32 %a ; (b < a) ? a - b : a
+  ret i32 %cond
+}
+define i32 @m(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: m:
+; CHECK-NOT: cmp
+  %cmp = icmp sgt i32 %a, %b               ; signed a > b (same operand order as the sub)
+  %sub = sub nsw i32 %a, %b                ; a - b
+  %cond = select i1 %cmp, i32 %b, i32 %sub ; (a > b) ? b : a - b
+  ret i32 %cond
+}
+; If EFLAGS is live-out, we can't remove cmp if there exists
+; a swapped sub.
+define i32 @l2(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l2:
+; CHECK: cmp
+  %cmp = icmp eq i32 %b, %a                ; b == a
+  %sub = sub nsw i32 %a, %b                ; a - b (operands swapped relative to both compares)
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %cmp2 = icmp sgt i32 %b, %a              ; second compare of the same operands, in a successor block
+  %sel = select i1 %cmp2, i32 %sub, i32 %a ; (b > a) ? a - b : a
+  ret i32 %sel
+
+if.else:
+  ret i32 %sub
+}
+define i32 @l3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l3:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: jge
+  %cmp = icmp sgt i32 %b, %a               ; signed b > a (operands swapped relative to the sub)
+  %sub = sub nsw i32 %a, %b                ; a - b
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  ret i32 %sub
+
+if.else:
+  %add = add nsw i32 %sub, 1               ; (a - b) + 1
+  ret i32 %add
+}
+; rdar://11830760
+; When Movr0 is between sub and cmp, we need to move "Movr0" before sub.
+define i32 @l4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l4:
+; CHECK: xor
+; CHECK: sub
+; CHECK-NOT: cmp
+  %cmp = icmp sgt i32 %b, %a               ; signed b > a (operands swapped relative to the sub)
+  %sub = sub i32 %a, %b                    ; a - b
+  %.sub = select i1 %cmp, i32 0, i32 %sub  ; (b > a) ? 0 : a - b
+  ret i32 %.sub
+}
+; rdar://11540023
+define i32 @n(i32 %x, i32 %y) nounwind {
+entry:
+; CHECK: n:
+; CHECK-NOT: sub
+; CHECK: cmp
+  %sub = sub nsw i32 %x, %y                ; x - y; its only use is the sign test below
+  %cmp = icmp slt i32 %sub, 0              ; (x - y) < 0
+  %y.x = select i1 %cmp, i32 %y, i32 %x    ; ((x - y) < 0) ? y : x
+  ret i32 %y.x
+}
+; PR13046
+define void @o() nounwind uwtable {
+entry:
+  %0 = load i16* undef, align 2
+  br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %entry
+  unreachable
+
+if.end.i:                                         ; preds = %entry
+  br i1 undef, label %sw.bb, label %sw.default
+
+sw.bb:                                            ; preds = %if.end.i
+  br i1 undef, label %if.then44, label %if.end29
+
+if.end29:                                         ; preds = %sw.bb
+; CHECK: o:
+; CHECK: cmp
+  %1 = urem i16 %0, 10                     ; remainder of the loaded i16 modulo 10
+  %cmp25 = icmp eq i16 %1, 0               ; this compare feeds both the select and the branch below
+  %. = select i1 %cmp25, i16 2, i16 0
+  br i1 %cmp25, label %if.then44, label %sw.default
+
+sw.default:                                       ; preds = %if.end29, %if.end.i
+  br i1 undef, label %if.then.i96, label %if.else.i97
+
+if.then.i96:                                      ; preds = %sw.default
+  unreachable
+
+if.else.i97:                                      ; preds = %sw.default
+  unreachable
+
+if.then44:                                        ; preds = %if.end29, %sw.bb
+  %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ]
+  br i1 undef, label %if.then.i103, label %if.else.i104
+
+if.then.i103:                                     ; preds = %if.then44
+  unreachable
+
+if.else.i104:                                     ; preds = %if.then44
+  ret void
+}
+; rdar://11855129
+define i32 @p(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: p:
+; CHECK-NOT: test
+; CHECK: cmovs
+  %add = add nsw i32 %b, %a                ; a + b
+  %cmp = icmp sgt i32 %add, 0              ; (a + b) > 0
+  %add. = select i1 %cmp, i32 %add, i32 0  ; ((a + b) > 0) ? a + b : 0
+  ret i32 %add.
+}
+; PR13475
+; If we have sub a, b and cmp b, a and the result of cmp is used
+; by sbb, we should not optimize cmp away.
+define i32 @q(i32 %j.4, i32 %w, i32 %el) {
+; CHECK: q:
+; CHECK: sub
+; CHECK: cmp
+; CHECK-NEXT: sbb
+  %tmp532 = add i32 %j.4, %w                       ; j.4 + w
+  %tmp533 = icmp ugt i32 %tmp532, %el              ; unsigned (j.4 + w) > el
+  %tmp534 = icmp ult i32 %w, %el                   ; unsigned w < el
+  %or.cond = and i1 %tmp533, %tmp534               ; both conditions hold
+  %tmp535 = sub i32 %el, %w                        ; el - w (operands swapped relative to the w < el compare)
+  %j.5 = select i1 %or.cond, i32 %tmp535, i32 %j.4 ; or.cond ? el - w : j.4
+  ret i32 %j.5
+}
+; rdar://11873276
+define i8* @r(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK: r:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: j
+; CHECK-NOT: sub
+; CHECK: ret
+  %0 = load i32* %offset, align 8
+  %cmp = icmp slt i32 %0, %size            ; signed *offset < size
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %sub = sub nsw i32 %0, %size             ; *offset - size, same operands as the compare in entry
+  store i32 %sub, i32* %offset, align 8    ; write the reduced offset back
+  %add.ptr = getelementptr inbounds i8* %base, i32 %sub ; base + (*offset - size)
+  br label %return
+
+return:
+  %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] ; null when *offset < size
+  ret i8* %retval.0
+}
diff --git a/test/CodeGen/X86/label-redefinition.ll b/test/CodeGen/X86/label-redefinition.ll
index 9ad33e0297667..9e88a18e87324 100644
--- a/test/CodeGen/X86/label-redefinition.ll
+++ b/test/CodeGen/X86/label-redefinition.ll
@@ -1,5 +1,5 @@
 ; PR7054
-; RUN: not llc %s -o - |& grep {'_foo' label emitted multiple times to assembly}
+; RUN: not llc %s -o - 2>&1 | grep "'_foo' label emitted multiple times to assembly"
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "i386-apple-darwin10.0.0"
 
diff --git a/test/CodeGen/X86/large-global.ll b/test/CodeGen/X86/large-global.ll
new file mode 100644
index 0000000000000..7cb974b21e739
--- /dev/null
+++ b/test/CodeGen/X86/large-global.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; rdar://11729134
+
+; EmitZerofill was incorrectly expecting a 32-bit "size" so 26214400000
+; was printed as 444596224
+
+%struct.X = type { [25000 x i8] }
+
+@gArray = global [1048576 x %struct.X] zeroinitializer, align 16
+
+; CHECK: .zerofill __DATA,__common,_gArray,26214400000,4
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 69303507d6e6c..43f69b0c6e93c 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep {lea	EAX, DWORD PTR \\\[... + 4\\*... - 5\\\]}
+; RUN:   grep "lea	EAX, DWORD PTR \[... + 4\*... - 5\]"
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
 ; RUN:   not grep add
 
diff --git a/test/CodeGen/X86/liveness-local-regalloc.ll b/test/CodeGen/X86/liveness-local-regalloc.ll
index b469d0837dc55..721f545985d7f 100644
--- a/test/CodeGen/X86/liveness-local-regalloc.ll
+++ b/test/CodeGen/X86/liveness-local-regalloc.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O3 -regalloc=fast -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs -mtriple=x86_64-apple-darwin10
 ; <rdar://problem/7755473>
+; PR12821
 
 %0 = type { i32, i8*, i8*, %1*, i8*, i64, i64, i32, i32, i32, i32, [1024 x i8] }
 %1 = type { i8*, i32, i32, i16, i16, %2, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %2, %3*, i32, [3 x i8], [1 x i8], %2, i32, i64 }
@@ -58,3 +59,34 @@ infloop:                                          ; preds = %infloop, %bb3
 infloop1:                                         ; preds = %infloop1, %bb5
   br label %infloop1
 }
+
+
+; RAFast would forget to add a super-register <imp-def> when rewriting:
+;  %vreg10:sub_32bit<def,read-undef> = COPY %R9D<kill>
+; This trips up the machine code verifier.
+define void @autogen_SD24657(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <16 x i16>
+  %A3 = alloca double
+  %A2 = alloca <2 x i8>
+  %A1 = alloca i1
+  %A = alloca i32
+  %L = load i8* %0                         ; %0 is the first (i8*) argument
+  store i8 -37, i8* %0
+  %E = extractelement <4 x i64> zeroinitializer, i32 2
+  %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %I = insertelement <2 x i8> <i8 -1, i8 -1>, i8 %5, i32 1 ; %5 is the sixth (i8) argument
+  %B = fadd float 0x45CDF5B1C0000000, 0x45CDF5B1C0000000
+  %FC = uitofp i32 275048 to double
+  %Sl = select i1 true, <2 x i8> %I, <2 x i8> <i8 -1, i8 -1>
+  %Cmp = icmp slt i64 0, %E
+  br label %CF
+
+CF:                                               ; preds = %BB
+  store i8 %5, i8* %0
+  store <2 x i8> %I, <2 x i8>* %A2         ; the only use of any of the allocas above
+  store i8 %5, i8* %0
+  store i8 %5, i8* %0
+  store i8 %5, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index d14102fe245bf..4bd162b452944 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -41,7 +41,6 @@ done:
 ; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB1_4:
 ; CHECK-NEXT:   callq bar99
-; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB1_1:
 ; CHECK-NEXT:   callq body
 
@@ -79,7 +78,6 @@ exit:
 ; CHECK-NEXT: .LBB2_5:
 ; CHECK-NEXT:   callq block_a_true_func
 ; CHECK-NEXT:   callq block_a_merge_func
-; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB2_1:
 ; CHECK-NEXT:   callq body
 ;
@@ -139,13 +137,13 @@ exit:
 ; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB3_7:
 ; CHECK-NEXT:   callq   bar100
-; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB3_1:
 ; CHECK-NEXT:   callq   loop_header
 ;      CHECK:   jl .LBB3_7
 ;      CHECK:   jge .LBB3_3
 ; CHECK-NEXT:   callq   bar101
 ; CHECK-NEXT:   jmp     .LBB3_1
+; CHECK-NEXT:   align
 ; CHECK-NEXT: .LBB3_3:
 ;      CHECK:   jge .LBB3_4
 ; CHECK-NEXT:   callq   bar102
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index ebda9f201df96..8a81f70a8a2a0 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,10 +1,16 @@
-; RUN: llc -mtriple=x86_64-darwin < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=generic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: t:
 ; CHECK: decq
-; CHECK-NEXT: movl (
+; CHECK-NEXT: movl (%r9,%rax,4), %eax
 ; CHECK-NEXT: jne
 
+; ATOM: t:
+; ATOM: movl (%r9,%rax,4), %eax
+; ATOM-NEXT: decq
+; ATOM-NEXT: jne
+
 @Te0 = external global [256 x i32]		; <[256 x i32]*> [#uses=5]
 @Te1 = external global [256 x i32]		; <[256 x i32]*> [#uses=4]
 @Te3 = external global [256 x i32]		; <[256 x i32]*> [#uses=2]
@@ -149,6 +155,13 @@ bb2:		; preds = %bb
 ; CHECK: jne
 ; CHECK: ret
 
+; ATOM: f:
+; ATOM: %for.body
+; ATOM: incl [[IV:%e..]]
+; ATOM: cmpl $1, [[IV]]
+; ATOM: jne
+; ATOM: ret
+
 define i32 @f(i32 %i, i32* nocapture %a) nounwind uwtable readonly ssp {
 entry:
   %cmp4 = icmp eq i32 %i, 1
diff --git a/test/CodeGen/X86/lsr-reuse-trunc.ll b/test/CodeGen/X86/lsr-reuse-trunc.ll
index 1f87089f80e70..276dab72f7cc6 100644
--- a/test/CodeGen/X86/lsr-reuse-trunc.ll
+++ b/test/CodeGen/X86/lsr-reuse-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck %s
 
 ; Full strength reduction wouldn't reduce register pressure, so LSR should
 ; stick with indexing here.
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index c9ed3e553a466..6566f563784d8 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: xorl  %eax, %eax
 ; CHECK: movsd .LCPI0_0(%rip), %xmm0
@@ -9,6 +10,15 @@
 ; CHECK-NEXT: movsd
 ; CHECK-NEXT: incq %rax
 
+; ATOM: movsd .LCPI0_0(%rip), %xmm0
+; ATOM: xorl  %eax, %eax
+; ATOM: align
+; ATOM-NEXT: BB0_2:
+; ATOM-NEXT: movsd A(,%rax,8)
+; ATOM-NEXT: mulsd
+; ATOM-NEXT: movsd
+; ATOM-NEXT: incq %rax
+
 @A = external global [0 x double]
 
 define void @foo(i64 %n) nounwind {
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index a757cde6abe9a..d171fd5f1d9fa 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -99,3 +99,60 @@ return:                                           ; preds = %if.end, %entry
   %retval.0 = phi i32 [ 1, %entry ], [ %., %if.end ]
   ret i32 %retval.0
 }
+
+; rdar://11393714
+define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
+; CHECK: %entry
+; CHECK: xorl
+; CHECK: %preheader
+; CHECK: %do.body
+; CHECK-NOT: xorl
+; CHECK: %do.cond
+; CHECK-NOT: xorl
+; CHECK: %return
+entry:
+  %cmp = icmp eq i64 %n, 0
+  br i1 %cmp, label %return, label %preheader
+
+preheader:
+  %conv2 = and i32 %c, 255
+  br label %do.body
+
+do.body:
+  %n.addr.0 = phi i64 [ %dec, %do.cond ], [ %n, %preheader ]
+  %p.0 = phi i8* [ %incdec.ptr, %do.cond ], [ %s, %preheader ]
+  %cmp3 = icmp eq i32 %a, %conv2
+  br i1 %cmp3, label %return, label %do.cond
+
+do.cond:
+  %incdec.ptr = getelementptr inbounds i8* %p.0, i64 1
+  %dec = add i64 %n.addr.0, -1
+  %cmp6 = icmp eq i64 %dec, 0
+  br i1 %cmp6, label %return, label %do.body
+
+return:
+  %retval.0 = phi i8* [ null, %entry ], [ null, %do.cond ], [ %p.0, %do.body ]
+  ret i8* %retval.0
+}
+
+; PR13578
+@t2_global = external global i32
+
+declare i1 @t2_func()
+
+define i32 @t2() {
+  store i32 42, i32* @t2_global
+  %c = call i1 @t2_func()
+  br i1 %c, label %a, label %b
+
+a:
+  %l = load i32* @t2_global
+  ret i32 %l
+
+b:
+  ret i32 0
+
+; CHECK: t2:
+; CHECK: t2_global@GOTPCREL(%rip)
+; CHECK-NOT: t2_global@GOTPCREL(%rip)
+}
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 80103d10388ba..0015df0c1facb 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
 ; Test the basic functionality of integer element promotions of different types.
 ; This tests checks passing of arguments, loading and storing to memory and
 ; basic arithmetic.
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86-64 -promote-elements < %s
+; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86-64 < %s
 
 define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
   %bb = load <1 x i8>* %b
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index f4bc1bb7015a0..723d1d89427ee 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -disable-simplify-libcalls -mtriple=x86_64-linux | FileCheck %s --check-prefix=NOBUILTIN
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
 
 ; This tests codegen time inlining/optimization of memcmp
@@ -23,6 +24,8 @@ return:                                           ; preds = %entry
 ; CHECK: memcmp2:
 ; CHECK: movw    ([[A0:%rdi|%rcx]]), %ax
 ; CHECK: cmpw    ([[A1:%rsi|%rdx]]), %ax
+; NOBUILTIN: memcmp2:
+; NOBUILTIN: callq
 }
 
 define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 689f7bf595648..206cb33494cf3 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -3,7 +3,7 @@
 
 define void @bork(<1 x i64>* %x) {
 ; CHECK: bork
-; CHECK: pextrd
+; CHECK: movlpd
 entry:
 	%tmp2 = load <1 x i64>* %x		; <<1 x i64>> [#uses=1]
 	%tmp6 = bitcast <1 x i64> %tmp2 to <2 x i32>		; <<2 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index aeb540fe42327..65ee7b1d8e002 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -55,4 +55,20 @@ entry:
 ; X64:	ret
 }
 
+; The two loads here both look identical to selection DAG, except for their
+; address spaces.  Make sure they aren't CSE'd.
+define i32 @test_no_cse() nounwind readonly {
+entry:
+	%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31)		; <i32*> [#uses=1]
+	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
+	%tmp2 = load i32* addrspace(257)* getelementptr (i32* addrspace(257)* inttoptr (i32 72 to i32* addrspace(257)*), i32 31)		; <i32*> [#uses=1]
+	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
+	%tmp4 = add i32 %tmp1, %tmp3
+	ret i32 %tmp4
+}
+; X32: test_no_cse:
+; X32: 	movl	%gs:196
+; X32: 	movl	%fs:196
+; X32: 	ret
+
 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index 4f7e28ace3cdf..9f7d036cf141c 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,9 +1,9 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 -mcpu=nehalem < %s | FileCheck %s
 ; rdar://7236213
-
-; Xfailed now that scheduler 2-address hack is disabled a lea is generated.
-; The code isn't any worse though.
-; XFAIL: *
+;
+; The scheduler's 2-address hack has been disabled, so there is
+; currently no good guarantee that this test will pass until the
+; machine scheduler develops an equivalent heuristic.
 
 ; CodeGen shouldn't require any lea instructions inside the marked loop.
 ; It should properly set up post-increment uses and do coalescing for
diff --git a/test/CodeGen/X86/neg_cmp.ll b/test/CodeGen/X86/neg_cmp.ll
new file mode 100644
index 0000000000000..866514ed9a2f7
--- /dev/null
+++ b/test/CodeGen/X86/neg_cmp.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+; rdar://11245199
+; PR12545
+define void @f(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+; CHECK: f:
+; CHECK-NOT: neg
+; CHECK: add
+  %sub = sub i32 0, %y
+  %cmp = icmp eq i32 %x, %sub
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void @g() nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+declare void @g()
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
index fc24913be5298..3e720844c437b 100644
--- a/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -promote-elements -mattr=+sse2,+sse41 | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s  -mattr=+sse2,+sse41 | FileCheck %s
 
 ; CHECK: func_4_8
 ; A single memory write
diff --git a/test/CodeGen/X86/overlap-shift.ll b/test/CodeGen/X86/overlap-shift.ll
index d185af16b90bd..e987495f2c01a 100644
--- a/test/CodeGen/X86/overlap-shift.ll
+++ b/test/CodeGen/X86/overlap-shift.ll
@@ -7,7 +7,7 @@
 ; Check that the shift gets turned into an LEA.
 
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   not grep {mov E.X, E.X}
+; RUN:   not grep "mov E.X, E.X"
 
 @G = external global i32                ; <i32*> [#uses=1]
 
diff --git a/test/CodeGen/X86/pass-three.ll b/test/CodeGen/X86/pass-three.ll
new file mode 100644
index 0000000000000..23005c77c13d5
--- /dev/null
+++ b/test/CodeGen/X86/pass-three.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.3.0"
+
+
+define { i8*, i64, i64* } @copy_3(i8* %a, i64 %b, i64* %c) nounwind {
+entry:
+  %0 = insertvalue { i8*, i64, i64* } undef, i8* %a, 0
+  %1 = insertvalue { i8*, i64, i64* } %0, i64 %b, 1
+  %2 = insertvalue { i8*, i64, i64* } %1, i64* %c, 2
+  ret { i8*, i64, i64* } %2
+}
+
+; CHECK: copy_3:
+; CHECK-NOT: (%rdi)
+; CHECK: ret
diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
index d48a3318262c1..f958b6b2c0690 100644
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ b/test/CodeGen/X86/peep-vector-extract-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {xorps	%xmm0, %xmm0} | count 2
+; RUN: llc < %s -march=x86-64 | grep "xorps	%xmm0, %xmm0" | count 2
 
 define float @foo(<4 x float> %a) {
   %b = insertelement <4 x float> %a, float 0.0, i32 3
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index ef02af2d78511..476bb10998314 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& grep {Number of blocks eliminated} | grep 6
+; RUN: llc < %s -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
 ; PR1296
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phielim-split.ll b/test/CodeGen/X86/phielim-split.ll
new file mode 100644
index 0000000000000..aa477359d60e5
--- /dev/null
+++ b/test/CodeGen/X86/phielim-split.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; The critical edge from for.cond to if.end2 should be split to avoid injecting
+; copies into the loop. The use of %b after the loop causes interference that
+; makes a copy necessary.
+; <rdar://problem/11561842>
+;
+; CHECK: split_loop_exit
+; CHECK: %for.cond
+; CHECK-NOT: mov
+; CHECK: je
+
+define i32 @split_loop_exit(i32 %a, i32 %b, i8* nocapture %p) nounwind uwtable readonly ssp {
+entry:
+  %cmp = icmp sgt i32 %a, 10
+  br i1 %cmp, label %for.cond, label %if.end2
+
+for.cond:                                         ; preds = %entry, %for.cond
+  %p.addr.0 = phi i8* [ %incdec.ptr, %for.cond ], [ %p, %entry ]
+  %incdec.ptr = getelementptr inbounds i8* %p.addr.0, i64 1
+  %0 = load i8* %p.addr.0, align 1
+  %tobool = icmp eq i8 %0, 0
+  br i1 %tobool, label %for.cond, label %if.end2
+
+if.end2:                                          ; preds = %for.cond, %entry
+  %r.0 = phi i32 [ %a, %entry ], [ %b, %for.cond ]
+  %add = add nsw i32 %r.0, %b
+  ret i32 %add
+}
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index 8b9ea17c4e232..37eca1ce0a729 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; CHECKed instructions should be the same with or without -O0.
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
+; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
 
 @.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
 
@@ -15,6 +16,19 @@ entry:
 ; CHECK: movl	%ebx, 40(%esp)
 ; CHECK-NOT: movl
 ; CHECK: addl %ebx, %eax
+
+; On Intel Atom the scheduler moves a movl instruction
+; used for the printf call to follow movl 24(%esp), %eax
+; ATOM: movl 24(%esp), %eax
+; ATOM: movl
+; ATOM: movl   %eax, 36(%esp)
+; ATOM-NOT: movl
+; ATOM: movl 28(%esp), %ebx
+; ATOM-NOT: movl
+; ATOM: movl   %ebx, 40(%esp)
+; ATOM-NOT: movl
+; ATOM: addl %ebx, %eax
+
   %retval = alloca i32                            ; <i32*> [#uses=2]
   %"%ebx" = alloca i32                            ; <i32*> [#uses=1]
   %"%eax" = alloca i32                            ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 4162015ea88e3..984d7e57e0c62 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,10 +1,14 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; XFAIL: *
 ; rdar://5571034
 
 ; This requires physreg joining, %vreg13 is live everywhere:
 ; 304L		%CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
 ; 320L		%vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
 ; 336L		%vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+;
+; This test is XFAIL until the register allocator understands trivial physreg
+; interference. <rdar://9802098>
 
 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
 ; CHECK: foo:
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index d8ed4c097e00c..da4af81959dc2 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,9 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 -join-physregs > %t
+; RUN: llc < %s -march=x86 -mattr=sse41 -mcpu=nehalem -stack-alignment=16 > %t
 ; RUN: grep pmul %t | count 12
 ; RUN: grep mov %t | count 11
 
-; The f() arguments in %xmm0 and %xmm1 cause an extra movdqa without -join-physregs.
-
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
         %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
         ret <4 x i32> %A
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index cc1df2fffcc5b..800fbedb4f99a 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -105,8 +105,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
 entry:
   %G = load <2 x i8*>* %p
 ;CHECK: movl
-;CHECK: movd
-;CHECK: pinsrd
+;CHECK: movsd
   %T = bitcast <2 x i8*> %G to <2 x i32*>
 ;CHECK: ret
   ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll
index e1fa0326b7621..6c32a2206a7ea 100644
--- a/test/CodeGen/X86/pr11415.ll
+++ b/test/CodeGen/X86/pr11415.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast -optimize-regalloc=0 | FileCheck %s
 
 ; We used to consider the early clobber in the second asm statement as
 ; defining %0 before it was read. This caused us to omit the
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
new file mode 100644
index 0000000000000..f7e9adb4a211d
--- /dev/null
+++ b/test/CodeGen/X86/pr11468.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; PR11468
+
+define void @f(i64 %sz) uwtable {
+entry:
+  %a = alloca i32, align 32
+  store volatile i32 0, i32* %a, align 32
+  ; force to push r14 on stack
+  call void asm sideeffect "nop", "~{r14},~{dirflag},~{fpsr},~{flags}"() nounwind, !srcloc !0
+  ret void
+
+; CHECK: _f
+; CHECK: pushq %rbp
+; CHECK: .cfi_offset %rbp, -16
+; CHECK: movq %rsp, %rbp
+; CHECK: .cfi_def_cfa_register %rbp
+
+; We first push register on stack, and then realign it, so that
+; .cfi_offset value is correct
+; CHECK: pushq %r14
+; CHECK: andq $-32, %rsp
+; CHECK: .cfi_offset %r14, -24
+
+; Restore %rsp from %rbp and subtract the total size of saved registers.
+; CHECK: leaq -8(%rbp), %rsp
+
+; Pop saved registers.
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+!0 = metadata !{i32 125}
+
diff --git a/test/CodeGen/X86/pr12889.ll b/test/CodeGen/X86/pr12889.ll
new file mode 100644
index 0000000000000..331d8f907d58d
--- /dev/null
+++ b/test/CodeGen/X86/pr12889.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@c0 = common global i8 0, align 1
+
+define void @func() nounwind uwtable {
+entry:
+  %0 = load i8* @c0, align 1, !tbaa !0
+  %tobool = icmp ne i8 %0, 0
+  %conv = zext i1 %tobool to i8
+  %storemerge = shl nuw nsw i8 %conv, %conv
+  store i8 %storemerge, i8* @c0, align 1
+  ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/pr13209.ll b/test/CodeGen/X86/pr13209.ll
new file mode 100644
index 0000000000000..1c93163659596
--- /dev/null
+++ b/test/CodeGen/X86/pr13209.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+
+; CHECK: pr13209:
+; CHECK-NOT: mov
+; CHECK: .size pr13209
+
+define zeroext i1 @pr13209(i8** %x, i8*** %jumpTable) nounwind {
+if.end51:
+  br label %indirectgoto.preheader
+indirectgoto.preheader:
+  %frombool.i5915.ph = phi i8 [ undef, %if.end51 ], [ %frombool.i5917, %jit_return ]
+  br label %indirectgoto
+do.end165:
+  %tmp92 = load i8** %x, align 8
+  br label %indirectgoto
+do.end209:
+  %tmp104 = load i8** %x, align 8
+  br label %indirectgoto
+do.end220:
+  %tmp107 = load i8** %x, align 8
+  br label %indirectgoto
+do.end231:
+  %tmp110 = load i8** %x, align 8
+  br label %indirectgoto
+do.end242:
+  %tmp113 = load i8** %x, align 8
+  br label %indirectgoto
+do.end253:
+  %tmp116 = load i8** %x, align 8
+  br label %indirectgoto
+do.end286:
+  %tmp125 = load i8** %x, align 8
+  br label %indirectgoto
+do.end297:
+  %tmp128 = load i8** %x, align 8
+  br label %indirectgoto
+do.end308:
+  %tmp131 = load i8** %x, align 8
+  br label %indirectgoto
+do.end429:
+  %tmp164 = load i8** %x, align 8
+  br label %indirectgoto
+do.end440:
+  %tmp167 = load i8** %x, align 8
+  br label %indirectgoto
+do.body482:
+  br i1 false, label %indirectgoto, label %do.body495
+do.body495:
+  br label %indirectgoto
+do.end723:
+  br label %inline_return
+inline_return:
+  %frombool.i5917 = phi i8 [ 0, %if.end5571 ], [ %frombool.i5915, %do.end723 ]
+  br label %jit_return
+jit_return:
+  br label %indirectgoto.preheader
+L_JSOP_UINT24:
+  %tmp864 = load i8** %x, align 8
+  br label %indirectgoto
+L_JSOP_THROWING:
+  %tmp1201 = load i8** %x, align 8
+  br label %indirectgoto
+do.body4936:
+  %tmp1240 = load i8** %x, align 8
+  br label %indirectgoto
+do.body5184:
+  %tmp1340 = load i8** %x, align 8
+  br label %indirectgoto
+if.end5571:
+  br  label %inline_return
+indirectgoto:
+  %frombool.i5915 = phi i8  [ 0, %do.body495 ],[ 0, %do.body482 ] , [ %frombool.i5915, %do.body4936 ],[ %frombool.i5915, %do.body5184 ], [ %frombool.i5915, %L_JSOP_UINT24 ], [ %frombool.i5915, %do.end286 ], [ %frombool.i5915, %do.end297 ], [ %frombool.i5915, %do.end308 ], [ %frombool.i5915, %do.end429 ], [ %frombool.i5915, %do.end440 ], [ %frombool.i5915, %L_JSOP_THROWING ], [ %frombool.i5915, %do.end253 ], [ %frombool.i5915, %do.end242 ], [ %frombool.i5915, %do.end231 ], [ %frombool.i5915, %do.end220 ], [ %frombool.i5915, %do.end209 ],[ %frombool.i5915, %do.end165 ], [ %frombool.i5915.ph, %indirectgoto.preheader ]
+  indirectbr i8* null, [ label %if.end5571, label %do.end165, label %do.end209, label %do.end220, label %do.end231, label %do.end242, label %do.end253, label %do.end723, label %L_JSOP_THROWING, label %do.end440, label %do.end429, label %do.end308, label %do.end297, label %do.end286, label %L_JSOP_UINT24, label %do.body5184, label %do.body4936, label %do.body482]
+}
diff --git a/test/CodeGen/X86/pr13220.ll b/test/CodeGen/X86/pr13220.ll
new file mode 100644
index 0000000000000..b9ac4b63ecf0a
--- /dev/null
+++ b/test/CodeGen/X86/pr13220.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s
+; PR13220
+
+define <8 x i32> @foo(<8 x i96> %x) {
+  %a = lshr <8 x i96> %x, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+  %b = trunc <8 x i96> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+define <8 x i32> @bar(<8 x i97> %x) {
+  %a = lshr <8 x i97> %x, <i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1>
+  %b = trunc <8 x i97> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+define <8 x i32> @bax() {
+  %a = lshr <8 x i96> <i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4>, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+  %b = trunc <8 x i96> %a to <8 x i32>
+  ret <8 x i32> %b
+}
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
new file mode 100644
index 0000000000000..faaec262cb917
--- /dev/null
+++ b/test/CodeGen/X86/pr13577.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=x86-64
+
+define x86_fp80 @foo(x86_fp80 %a) {
+  %1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
+  ret x86_fp80 %1
+}
+
+declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index afd71143c4588..f0e31f7f5fdc6 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {xorps.\*sp} | count 1
+; RUN: llc < %s -march=x86 -mattr=+sse2 | grep "xorps.*sp" | count 1
 ; PR2656
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/pr3522.ll b/test/CodeGen/X86/pr3522.ll
index 112253038b797..d8f37781fc6e3 100644
--- a/test/CodeGen/X86/pr3522.ll
+++ b/test/CodeGen/X86/pr3522.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& not grep {instructions sunk}
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep "instructions sunk"
 ; PR3522
 
 target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/promote-trunc.ll b/test/CodeGen/X86/promote-trunc.ll
index 4211d82268d27..40a58b0739247 100644
--- a/test/CodeGen/X86/promote-trunc.ll
+++ b/test/CodeGen/X86/promote-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -promote-elements < %s -march=x86-64
+; RUN: llc < %s -march=x86-64
 
 define<4 x i8> @func_8_64() {
   %F = load <4 x i64>* undef
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index faca3d7bacdb1..8ef9b5dec0d5c 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -177,3 +177,49 @@ if.end4:
 return:
   ret void
 }
+
+; Deal with TokenFactor chain
+; rdar://11236106
+@foo = external global i64*, align 8
+
+define void @test3() nounwind ssp {
+entry:
+; CHECK: test3:
+; CHECK: decq 16(%rax)
+  %0 = load i64** @foo, align 8
+  %arrayidx = getelementptr inbounds i64* %0, i64 2
+  %1 = load i64* %arrayidx, align 8
+  %dec = add i64 %1, -1
+  store i64 %dec, i64* %arrayidx, align 8
+  %cmp = icmp eq i64 %dec, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @baz() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @baz()
+
+; Avoid creating a cycle in the DAG which would trigger an assert in the
+; scheduler.
+; PR12565
+; rdar://11451474
+@x = external global i32, align 4
+@y = external global i32, align 4
+@z = external global i32, align 4
+
+define void @test4() nounwind uwtable ssp {
+entry:
+  %0 = load i32* @x, align 4
+  %1 = load i32* @y, align 4
+  %dec = add nsw i32 %1, -1
+  store i32 %dec, i32* @y, align 4
+  %tobool.i = icmp ne i32 %dec, 0
+  %cond.i = select i1 %tobool.i, i32 %0, i32 0
+  store i32 %cond.i, i32* @z, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
new file mode 100644
index 0000000000000..e2224a6196769
--- /dev/null
+++ b/test/CodeGen/X86/rdrand.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx-i -mattr=+rdrand | FileCheck %s
+declare {i16, i32} @llvm.x86.rdrand.16()
+declare {i32, i32} @llvm.x86.rdrand.32()
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i32 @_rdrand16_step(i16* %random_val) {
+  %call = call {i16, i32} @llvm.x86.rdrand.16()
+  %randval = extractvalue {i16, i32} %call, 0
+  store i16 %randval, i16* %random_val
+  %isvalid = extractvalue {i16, i32} %call, 1
+  ret i32 %isvalid
+; CHECK: _rdrand16_step:
+; CHECK: rdrandw	%ax
+; CHECK: movw	%ax, (%r[[A0:di|cx]])
+; CHECK: movzwl	%ax, %ecx
+; CHECK: movl	$1, %eax
+; CHECK: cmovael	%ecx, %eax
+; CHECK: ret
+}
+
+define i32 @_rdrand32_step(i32* %random_val) {
+  %call = call {i32, i32} @llvm.x86.rdrand.32()
+  %randval = extractvalue {i32, i32} %call, 0
+  store i32 %randval, i32* %random_val
+  %isvalid = extractvalue {i32, i32} %call, 1
+  ret i32 %isvalid
+; CHECK: _rdrand32_step:
+; CHECK: rdrandl	%e[[T0:[a-z]+]]
+; CHECK: movl	%e[[T0]], (%r[[A0]])
+; CHECK: movl	$1, %eax
+; CHECK: cmovael	%e[[T0]], %eax
+; CHECK: ret
+}
+
+define i32 @_rdrand64_step(i64* %random_val) {
+  %call = call {i64, i32} @llvm.x86.rdrand.64()
+  %randval = extractvalue {i64, i32} %call, 0
+  store i64 %randval, i64* %random_val
+  %isvalid = extractvalue {i64, i32} %call, 1
+  ret i32 %isvalid
+; CHECK: _rdrand64_step:
+; CHECK: rdrandq	%r[[T1:[a-z]+]]
+; CHECK: movq	%r[[T1]], (%r[[A0]])
+; CHECK: movl	$1, %eax
+; CHECK: cmovael	%e[[T1]], %eax
+; CHECK: ret
+}
+
+; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
+define i32 @CSE() nounwind {
+ %rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v1 = extractvalue { i32, i32 } %rand1, 0
+ %rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v2 = extractvalue { i32, i32 } %rand2, 0
+ %add = add i32 %v2, %v1
+ ret i32 %add
+; CHECK: CSE:
+; CHECK: rdrandl
+; CHECK: rdrandl
+}
+
+; Check that MachineLICM doesn't hoist rdrand instructions.
+define void @loop(i32* %p, i32 %n) nounwind {
+entry:
+  %tobool1 = icmp eq i32 %n, 0
+  br i1 %tobool1, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %p.addr.03 = phi i32* [ %incdec.ptr, %while.body ], [ %p, %entry ]
+  %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+  %dec = add nsw i32 %n.addr.02, -1
+  %incdec.ptr = getelementptr inbounds i32* %p.addr.03, i64 1
+  %rand = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+  %v1 = extractvalue { i32, i32 } %rand, 0
+  store i32 %v1, i32* %p.addr.03, align 4
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+; CHECK: loop:
+; CHECK-NOT: rdrandl
+; CHECK: This Inner Loop Header: Depth=1
+; CHECK: rdrandl
+}
diff --git a/test/CodeGen/X86/regpressure.ll b/test/CodeGen/X86/regpressure.ll
index e0b5f7a870bb2..52d7b56f182ec 100644
--- a/test/CodeGen/X86/regpressure.ll
+++ b/test/CodeGen/X86/regpressure.ll
@@ -1,8 +1,8 @@
 ;; Both functions in this testcase should codegen to the same function, and
 ;; neither of them should require spilling anything to the stack.
 
-; RUN: llc < %s -march=x86 -stats |& \
-; RUN:   not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
+; RUN:   not grep "Number of register spills"
 
 ;; This can be compiled to use three registers if the loads are not
 ;; folded into the multiplies, 2 registers otherwise.
diff --git a/test/CodeGen/X86/remat-fold-load.ll b/test/CodeGen/X86/remat-fold-load.ll
new file mode 100644
index 0000000000000..de77ad3756729
--- /dev/null
+++ b/test/CodeGen/X86/remat-fold-load.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -disable-fp-elim -verify-coalescing
+; PR13414
+;
+; During coalescing, remat triggers DCE which deletes the penultimate use of a
+; load. This load should not be folded into the remaining use because it is not
+; safe to move, and it would extend the live range of the address.
+;
+; LiveRangeEdit::foldAsLoad() doesn't extend live ranges, so -verify-coalescing
+; catches the problem.
+
+target triple = "i386-unknown-linux-gnu"
+
+%type_a = type { %type_a*, %type_b }
+%type_b = type { %type_c, i32 }
+%type_c = type { i32, %type_d }
+%type_d = type { i64 }
+%type_e = type { %type_c, i64 }
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define linkonce_odr void @test() nounwind {       ; NOTE(review): the file header does not name the load that must not be folded; confirm against PR13414
+entry:                                            ; reduced testcase: many branch conditions and operands below are undef
+  br i1 undef, label %while.end.while.end26_crit_edge, label %while.body12.lr.ph
+
+while.end.while.end26_crit_edge:                  ; preds = %entry
+  br label %while.end26
+
+while.body12.lr.ph:                               ; preds = %entry
+  br label %while.body12
+
+while.body12:                                     ; preds = %if.end24, %while.body12.lr.ph
+  %tmp = phi %type_a* [ undef, %while.body12.lr.ph ], [ %tmp18, %if.end24 ]   ; loop-carried %type_a pointer; the next value is loaded in if.end24
+  %ins151154161 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp, %if.end24 ]
+  %ins135156160 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp158, %if.end24 ]
+  %ins151 = or i128 0, %ins151154161
+  %cmp.i.i.i.i.i67 = icmp sgt i32 undef, 8
+  br i1 %cmp.i.i.i.i.i67, label %if.then.i.i.i.i71, label %if.else.i.i.i.i74
+
+if.then.i.i.i.i71:                                ; preds = %while.body12
+  %call4.i.i.i.i68 = call noalias i8* @malloc(i32 undef) nounwind
+  %tmp1 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1
+  %buf_6.i.i.i.i70 = bitcast %type_d* %tmp1 to i8**
+  %tmp2 = load i8** %buf_6.i.i.i.i70, align 4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* %tmp2, i32 undef, i32 1, i1 false) nounwind
+  unreachable
+
+if.else.i.i.i.i74:                                ; preds = %while.body12
+  %i_.i.i.i.i72 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1, i32 0
+  %tmp3 = load i64* %i_.i.i.i.i72, align 4        ; i64 loaded through %tmp; widened and shifted into %ins148 below
+  %tmp4 = zext i64 %tmp3 to i128
+  %tmp5 = shl nuw nsw i128 %tmp4, 32
+  %ins148 = or i128 %tmp5, %ins151
+  %second3.i.i76 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 1
+  %tmp6 = load i32* %second3.i.i76, align 4       ; also loaded through %tmp; used by %tmp7 here and by %cmp.i99 in block A
+  %tmp7 = zext i32 %tmp6 to i128
+  %tmp8 = shl nuw i128 %tmp7, 96
+  %mask144 = and i128 %ins148, 79228162495817593519834398720
+  %tmp9 = load %type_e** undef, align 4           ; base pointer for the loads %tmp10, %tmp13 and %tmp14
+  %len_.i.i.i.i86 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 0
+  %tmp10 = load i32* %len_.i.i.i.i86, align 4
+  %tmp11 = zext i32 %tmp10 to i128
+  %ins135 = or i128 %tmp11, %ins135156160
+  %cmp.i.i.i.i.i88 = icmp sgt i32 %tmp10, 8
+  br i1 %cmp.i.i.i.i.i88, label %if.then.i.i.i.i92, label %if.else.i.i.i.i95
+
+if.then.i.i.i.i92:                                ; preds = %if.else.i.i.i.i74
+  %call4.i.i.i.i89 = call noalias i8* @malloc(i32 %tmp10) nounwind
+  %ins126 = or i128 0, %ins135
+  %tmp12 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1
+  %buf_6.i.i.i.i91 = bitcast %type_d* %tmp12 to i8**
+  %tmp13 = load i8** %buf_6.i.i.i.i91, align 4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %call4.i.i.i.i89, i8* %tmp13, i32 %tmp10, i32 1, i1 false) nounwind
+  br label %A
+
+if.else.i.i.i.i95:                                ; preds = %if.else.i.i.i.i74
+  %i_.i.i.i.i93 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1, i32 0   ; result has no use in this function
+  br label %A
+
+A:                                                ; preds = %if.else.i.i.i.i95, %if.then.i.i.i.i92
+  %ins135157 = phi i128 [ %ins126, %if.then.i.i.i.i92 ], [ undef, %if.else.i.i.i.i95 ]
+  %second3.i.i97 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 1
+  %tmp14 = load i64* %second3.i.i97, align 4
+  %tmp15 = trunc i64 %tmp14 to i32
+  %cmp.i99 = icmp sgt i32 %tmp6, %tmp15           ; consumed later, by the branch in block C
+  %tmp16 = trunc i128 %ins135157 to i32
+  %cmp.i.i.i.i.i.i101 = icmp sgt i32 %tmp16, 8
+  br i1 %cmp.i.i.i.i.i.i101, label %if.then.i.i.i.i.i103, label %B
+
+if.then.i.i.i.i.i103:                             ; preds = %A
+  unreachable
+
+B:                                                ; preds = %A
+  %tmp17 = trunc i128 %ins148 to i32
+  %cmp.i.i.i.i.i.i83 = icmp sgt i32 %tmp17, 8
+  br i1 %cmp.i.i.i.i.i.i83, label %if.then.i.i.i.i.i85, label %C
+
+if.then.i.i.i.i.i85:                              ; preds = %B
+  unreachable
+
+C:                                                ; preds = %B
+  br i1 %cmp.i99, label %if.then17, label %if.end24
+
+if.then17:                                        ; preds = %C
+  br i1 false, label %if.then.i.i.i.i.i43, label %D
+
+if.then.i.i.i.i.i43:                              ; preds = %if.then17
+  unreachable
+
+D:                                                ; preds = %if.then17
+  br i1 undef, label %if.then.i.i.i.i.i, label %E
+
+if.then.i.i.i.i.i:                                ; preds = %D
+  unreachable
+
+E:                                                ; preds = %D
+  br label %if.end24
+
+if.end24:                                         ; preds = %E, %C
+  %phitmp = or i128 %tmp8, %mask144               ; fed back into %ins151154161 on the next iteration
+  %phitmp158 = or i128 undef, undef
+  %tmp18 = load %type_a** undef, align 4          ; becomes %tmp on the next iteration
+  %tmp19 = load %type_a** undef, align 4
+  %cmp.i49 = icmp eq %type_a* %tmp18, %tmp19
+  br i1 %cmp.i49, label %while.cond10.while.end26_crit_edge, label %while.body12
+
+while.cond10.while.end26_crit_edge:               ; preds = %if.end24
+  %.pre = load %type_e** undef, align 4
+  br label %while.end26
+
+while.end26:                                      ; preds = %while.cond10.while.end26_crit_edge, %while.end.while.end26_crit_edge
+  br i1 undef, label %while.body.lr.ph.i, label %F
+
+while.body.lr.ph.i:                               ; preds = %while.end26
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %while.body.i, %while.body.lr.ph.i
+  br i1 false, label %while.body.i, label %F
+
+F:                                                ; preds = %while.body.i, %while.end26
+  ret void
+}
+
+declare noalias i8* @malloc(i32) nounwind
diff --git a/test/CodeGen/X86/remat-scalar-zero.ll b/test/CodeGen/X86/remat-scalar-zero.ll
index 75f438d26cd0c..f6095a75561c8 100644
--- a/test/CodeGen/X86/remat-scalar-zero.ll
+++ b/test/CodeGen/X86/remat-scalar-zero.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu > %t
 ; RUN: not grep xor %t
 ; RUN: not grep movap %t
-; RUN: grep {\\.quad.*0} %t
+; RUN: grep "\.quad.*0" %t
 
 ; Remat should be able to fold the zero constant into the div instructions
 ; as a constant-pool load.
diff --git a/test/CodeGen/X86/reverse_branches.ll b/test/CodeGen/X86/reverse_branches.ll
new file mode 100644
index 0000000000000..97721250377e2
--- /dev/null
+++ b/test/CodeGen/X86/reverse_branches.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [7 x i8] c"memchr\00", align 1
+@.str3 = private unnamed_addr constant [11 x i8] c"bsd_memchr\00", align 1
+@str4 = private unnamed_addr constant [5 x i8] c"Bug!\00"
+
+; Make sure at end of do.cond.i, we jump to do.body.i first to have a tighter
+; inner loop.
+define i32 @test_branches_order() uwtable ssp {
+; CHECK: test_branches_order:
+; CHECK: [[L0:LBB0_[0-9]+]]: ## %do.body.i
+; CHECK: je
+; CHECK: %do.cond.i
+; CHECK: jne [[L0]]
+; CHECK: jmp
+; CHECK: %exit
+entry:
+  %strs = alloca [1000 x [1001 x i8]], align 16
+  br label %for.cond
+
+for.cond:                                         ; first outer loop: %j.0 counts 0..999
+  %j.0 = phi i32 [ 0, %entry ], [ %inc10, %for.inc9 ]
+  %cmp = icmp slt i32 %j.0, 1000
+  br i1 %cmp, label %for.cond1, label %for.end11
+
+for.cond1:                                        ; first inner loop: row index runs while < 1000
+  %indvars.iv50 = phi i64 [ %indvars.iv.next51, %for.body3 ], [ 0, %for.cond ]
+  %0 = trunc i64 %indvars.iv50 to i32
+  %cmp2 = icmp slt i32 %0, 1000
+  br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.body3:
+  %arraydecay = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 0
+  %call = call i8* @memchr(i8* %arraydecay, i32 120, i64 1000)   ; library memchr for byte 120 ('x') in the first 1000 bytes of the row
+  %add.ptr = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 %indvars.iv50
+  %cmp7 = icmp eq i8* %call, %add.ptr             ; result is compared against &strs[i][i]
+  %indvars.iv.next51 = add i64 %indvars.iv50, 1
+  br i1 %cmp7, label %for.cond1, label %if.then
+
+if.then:                                          ; mismatch path: print "Bug!" and exit(1)
+  %puts = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+  call void @exit(i32 1) noreturn
+  unreachable
+
+for.inc9:
+  %inc10 = add nsw i32 %j.0, 1
+  br label %for.cond
+
+for.end11:                                        ; prints "memchr", then starts the second loop nest
+  %puts42 = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str2, i64 0, i64 0))
+  br label %for.cond14
+
+for.cond14:                                       ; second outer loop: %j13.0 counts 0..999
+  %j13.0 = phi i32 [ 0, %for.end11 ], [ %inc39, %for.inc38 ]
+  %cmp15 = icmp slt i32 %j13.0, 1000
+  br i1 %cmp15, label %for.cond18, label %for.end40
+
+for.cond18:                                       ; second inner loop: row index runs while < 1000
+  %indvars.iv = phi i64 [ %indvars.iv.next, %exit ], [ 0, %for.cond14 ]
+  %1 = trunc i64 %indvars.iv to i32
+  %cmp19 = icmp slt i32 %1, 1000
+  br i1 %cmp19, label %for.body20, label %for.inc38
+
+for.body20:
+  %arraydecay24 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 0
+  br label %do.body.i
+
+do.body.i:                                        ; open-coded byte scan: load *p and compare it with 120 ('x')
+  %n.addr.0.i = phi i64 [ %dec.i, %do.cond.i ], [ 1000, %for.body20 ]
+  %p.0.i = phi i8* [ %incdec.ptr.i, %do.cond.i ], [ %arraydecay24, %for.body20 ]
+  %2 = load i8* %p.0.i, align 1
+  %cmp3.i = icmp eq i8 %2, 120
+  br i1 %cmp3.i, label %exit, label %do.cond.i
+
+do.cond.i:                                        ; advance p, decrement the remaining count, loop back while it is non-zero
+  %incdec.ptr.i = getelementptr inbounds i8* %p.0.i, i64 1
+  %dec.i = add i64 %n.addr.0.i, -1
+  %cmp5.i = icmp eq i64 %dec.i, 0
+  br i1 %cmp5.i, label %if.then32, label %do.body.i
+
+exit:                                             ; byte found: its address is compared against &strs[i][i]
+  %add.ptr30 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 %indvars.iv
+  %cmp31 = icmp eq i8* %p.0.i, %add.ptr30
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br i1 %cmp31, label %for.cond18, label %if.then32
+
+if.then32:                                        ; mismatch or count exhausted: print "Bug!" and exit(1)
+  %puts43 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+  call void @exit(i32 1) noreturn
+  unreachable
+
+for.inc38:
+  %inc39 = add nsw i32 %j13.0, 1
+  br label %for.cond14
+
+for.end40:                                        ; prints "bsd_memchr" and returns 0
+  %puts44 = call i32 @puts(i8* getelementptr inbounds ([11 x i8]* @.str3, i64 0, i64 0))
+  ret i32 0
+}
+
+declare i8* @memchr(i8*, i32, i64) nounwind readonly
+declare void @exit(i32) noreturn
+declare i32 @puts(i8* nocapture) nounwind
+
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 1e20273194d58..117300110b411 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep {ro\[rl\]} | count 12
+; RUN:   grep "ro[rl]" | count 12
 
 define i32 @rotl32(i32 %A, i8 %Amt) {
 	%shift.upgrd.1 = zext i8 %Amt to i32		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index 0dd74ea0791e3..51fcf6418429d 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
 
 define float @test1(float %x) nounwind  {
   %call = tail call float @floorf(float %x) nounwind readnone
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index 5ce08aa51c76b..d68b00b69a2bb 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -51,14 +51,14 @@ false:
 ; X64-NEXT: callq __morestack
 ; X64-NEXT: ret
 
-; X64:      movq %rsp, %rdi
-; X64-NEXT: subq %rax, %rdi
-; X64-NEXT: cmpq %rdi, %fs:112
+; X64:      movq %rsp, %[[RDI:rdi|rax]]
+; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64-NEXT: cmpq %[[RDI]], %fs:112
 
-; X64:      movq %rdi, %rsp
+; X64:      movq %[[RDI]], %rsp
 
-; X64:      movq %rax, %rdi
+; X64:      movq %{{.*}}, %rdi
 ; X64-NEXT: callq __morestack_allocate_stack_space
-; X64-NEXT: movq %rax, %rdi
+; X64:      movq %rax, %rdi
 
 }
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index f465a4ffc5847..2e39473057b12 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=atom | FileCheck -check-prefix=ATOM %s
 ; PR5757
 
 %0 = type { i64, i32 }
@@ -12,6 +13,10 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
 ; CHECK: test1:
 ; CHECK: cmovneq %rdi, %rsi
 ; CHECK: movl (%rsi), %eax
+
+; ATOM: test1:
+; ATOM: cmovneq %rdi, %rsi
+; ATOM: movl (%rsi), %eax
 }
 
 
@@ -31,6 +36,10 @@ bb91:		; preds = %bb84
 ; CHECK: test2:
 ; CHECK: movnew
 ; CHECK: movswl
+
+; ATOM: test2:
+; ATOM: movnew
+; ATOM: movswl
 }
 
 declare i1 @return_false()
@@ -44,6 +53,9 @@ entry:
 	ret float %iftmp.0.0
 ; CHECK: test3:
 ; CHECK: movss	{{.*}},4), %xmm0
+
+; ATOM: test3:
+; ATOM: movss  {{.*}},4), %xmm0
 }
 
 define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
@@ -55,6 +67,9 @@ entry:
 	ret i8 %2
 ; CHECK: test4:
 ; CHECK: movsbl	({{.*}},4), %eax
+
+; ATOM: test4:
+; ATOM: movsbl ({{.*}},4), %eax
 }
 
 define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
@@ -62,6 +77,8 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
   store <2 x i16> %x, <2 x i16>* %p
   ret void
 ; CHECK: test5:
+
+; ATOM: test5:
 }
 
 define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
@@ -79,6 +96,12 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
 ; CHECK: ret
 ; CHECK: mulps
 ; CHECK: ret
+
+; ATOM: test6:
+; ATOM: je
+; ATOM: ret
+; ATOM: mulps
+; ATOM: ret
 }
 
 ; Select with fp80's
@@ -89,6 +112,10 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
 ; CHECK: test7:
 ; CHECK: leaq
 ; CHECK: fldt (%r{{.}}x,%r{{.}}x)
+
+; ATOM: test7:
+; ATOM: leaq
+; ATOM: fldt (%r{{.}}x,%r{{.}}x)
 }
 
 ; widening select v6i32 and then a sub
@@ -97,8 +124,10 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
 	%val = sub <6 x i32> %x, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	store <6 x i32> %val, <6 x i32>* %dst.addr
 	ret void
-        
+
 ; CHECK: test8:
+
+; ATOM: test8:
 }
 
 
@@ -113,6 +142,12 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: sbbq	%rax, %rax
 ; CHECK: orq	%rsi, %rax
 ; CHECK: ret
+
+; ATOM: test9:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: orq    %rsi, %rax
+; ATOM: ret
 }
 
 ;; Same as test9
@@ -125,6 +160,12 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: sbbq	%rax, %rax
 ; CHECK: orq	%rsi, %rax
 ; CHECK: ret
+
+; ATOM: test9a:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: orq    %rsi, %rax
+; ATOM: ret
 }
 
 define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -137,6 +178,12 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: sbbq	%rax, %rax
 ; CHECK: orq	%rsi, %rax
 ; CHECK: ret
+
+; ATOM: test9b:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: orq    %rsi, %rax
+; ATOM: ret
 }
 
 ;; Select between -1 and 1.
@@ -149,6 +196,12 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: sbbq	%rax, %rax
 ; CHECK: orq	$1, %rax
 ; CHECK: ret
+
+; ATOM: test10:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: orq    $1, %rax
+; ATOM: ret
 }
 
 
@@ -163,6 +216,13 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: notq %rax
 ; CHECK: orq	%rsi, %rax
 ; CHECK: ret
+
+; ATOM: test11:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq    %rsi, %rax
+; ATOM: ret
 }
 
 define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -175,6 +235,13 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK: notq %rax
 ; CHECK: orq	%rsi, %rax
 ; CHECK: ret
+
+; ATOM: test11a:
+; ATOM: cmpq   $1, %rdi
+; ATOM: sbbq   %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq    %rsi, %rax
+; ATOM: ret
 }
 
 
@@ -189,10 +256,16 @@ entry:
   %call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
   ret i8* %call
 ; CHECK: test12:
-; CHECK: mulq
 ; CHECK: movq $-1, %rdi
+; CHECK: mulq
 ; CHECK: cmovnoq	%rax, %rdi
 ; CHECK: jmp	__Znam
+
+; ATOM: test12:
+; ATOM: mulq
+; ATOM: movq $-1, %rdi
+; ATOM: cmovnoq        %rax, %rdi
+; ATOM: jmp    __Znam
 }
 
 declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
@@ -205,6 +278,11 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
 ; CHECK: cmpl
 ; CHECK-NEXT: sbbl
 ; CHECK-NEXT: ret
+
+; ATOM: test13:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: ret
 }
 
 define i32 @test14(i32 %a, i32 %b) nounwind {
@@ -216,5 +294,53 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
 ; CHECK-NEXT: sbbl
 ; CHECK-NEXT: notl
 ; CHECK-NEXT: ret
+
+; ATOM: test14:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: notl
+; ATOM-NEXT: ret
+}
+
+; rdar://10961709
+define i32 @test15(i32 %x) nounwind {
+entry:
+  %cmp = icmp ne i32 %x, 0
+  %sub = sext i1 %cmp to i32                      ; all-ones (-1) when %x != 0, otherwise 0
+  ret i32 %sub
+; CHECK: test15:
+; CHECK: negl
+; CHECK: sbbl
+
+; ATOM: test15:
+; ATOM: negl
+; ATOM: sbbl
 }
 
+define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
+entry:
+  %cmp = icmp ne i64 %x, 0
+  %conv1 = sext i1 %cmp to i64                    ; 64-bit variant: -1 when %x != 0, otherwise 0
+  ret i64 %conv1
+; CHECK: test16:
+; CHECK: negq
+; CHECK: sbbq
+
+; ATOM: test16:
+; ATOM: negq
+; ATOM: sbbq
+}
+
+define i16 @test17(i16 %x) nounwind {
+entry:
+  %cmp = icmp ne i16 %x, 0
+  %sub = sext i1 %cmp to i16                      ; 16-bit variant: -1 when %x != 0, otherwise 0
+  ret i16 %sub
+; CHECK: test17:
+; CHECK: negw
+; CHECK: sbbw
+
+; ATOM: test17:
+; ATOM: negw
+; ATOM: sbbw
+}
diff --git a/test/CodeGen/X86/selectiondag-cse.ll b/test/CodeGen/X86/selectiondag-cse.ll
new file mode 100644
index 0000000000000..a653a1c8ca3ba
--- /dev/null
+++ b/test/CodeGen/X86/selectiondag-cse.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s
+; PR12599
+;
+; This bitcode causes the X86 target to make changes to the DAG during
+; selection in MatchAddressRecursively. The edit triggers CSE which causes both
+; the current node and yet-to-be-selected nodes to be deleted.
+;
+; SelectionDAGISel::DoInstructionSelection must handle that.
+;
+target triple = "x86_64-apple-macosx"
+
+%0 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i8**, i32, i32***, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], i32****, i32***, i32***, i32***, i32****, i32****, %1*, %2*, %9*, i32*, i32*, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i16******, i16******, i16******, i16******, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [32 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %10*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double**, double***, i32***, double**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [2 x i32], i32, i32, i16, i32, i32, i32, i32, i32 }
+%1 = type { i32, i32, [100 x %2*], i32, float, float, float }
+%2 = type { i32, i32, i32, i32, i32, i32, %3*, %6*, %8*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (i32)*, [3 x [2 x i32]] }
+%3 = type { %4*, %5, %5 }
+%4 = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 }
+%5 = type { i32, i32, i32, i32, i32, i8*, i32*, i32, i32 }
+%6 = type { [3 x [11 x %7]], [2 x [9 x %7]], [2 x [10 x %7]], [2 x [6 x %7]], [4 x %7], [4 x %7], [3 x %7] }
+%7 = type { i16, i8, i64 }
+%8 = type { [2 x %7], [4 x %7], [3 x [4 x %7]], [10 x [4 x %7]], [10 x [15 x %7]], [10 x [15 x %7]], [10 x [5 x %7]], [10 x [5 x %7]], [10 x [15 x %7]], [10 x [15 x %7]] }
+%9 = type { i32, i32, i32, [2 x i32], i32, [8 x i32], %9*, %9*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+%10 = type { i32, i32, i32, i32, i32, %10* }
+
+@images = external hidden global %0, align 8
+
+define hidden fastcc void @Mode_Decision_for_4x4IntraBlocks() nounwind uwtable ssp {
+bb4:
+  %tmp = or i208 undef, 0
+  br i1 undef, label %bb35, label %bb5
+
+bb5:                                              ; several expressions below are written twice; presumably this redundancy is what lets CSE fire during selection - TODO confirm
+  %tmp6 = add i32 0, 2                            ; same expression as %tmp11
+  %tmp7 = lshr i208 %tmp, 80                      ; same expression as %tmp28
+  %tmp8 = trunc i208 %tmp7 to i32
+  %tmp9 = and i32 %tmp8, 65535                    ; bits 80..95 of %tmp
+  %tmp10 = shl nuw nsw i32 %tmp9, 1               ; (bits 80..95 of %tmp) * 2
+  %tmp11 = add i32 0, 2                           ; duplicate of %tmp6
+  %tmp12 = add i32 %tmp11, 0
+  %tmp13 = add i32 %tmp12, %tmp10
+  %tmp14 = lshr i32 %tmp13, 2
+  %tmp15 = trunc i32 %tmp14 to i16
+  store i16 %tmp15, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 0, i64 3), align 2
+  %tmp16 = lshr i208 %tmp, 96
+  %tmp17 = trunc i208 %tmp16 to i32
+  %tmp18 = and i32 %tmp17, 65535                  ; bits 96..111 of %tmp
+  %tmp19 = add i32 %tmp18, 2
+  %tmp20 = add i32 %tmp19, 0
+  %tmp21 = add i32 %tmp20, 0
+  %tmp22 = lshr i32 %tmp21, 2
+  %tmp23 = trunc i32 %tmp22 to i16
+  store i16 %tmp23, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 2, i64 3), align 2
+  %tmp24 = add i32 %tmp6, %tmp9
+  %tmp25 = add i32 %tmp24, 0
+  %tmp26 = lshr i32 %tmp25, 2
+  %tmp27 = trunc i32 %tmp26 to i16
+  store i16 %tmp27, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 2), align 4
+  %tmp28 = lshr i208 %tmp, 80                     ; duplicate of %tmp7
+  %tmp29 = shl nuw nsw i208 %tmp28, 1             ; here the shift-by-one happens before the truncate and mask
+  %tmp30 = trunc i208 %tmp29 to i32
+  %tmp31 = and i32 %tmp30, 131070                 ; 131070 = 65535 << 1, so this is the same value as %tmp10 computed in a different order
+  %tmp32 = add i32 %tmp12, %tmp31                 ; same sum as %tmp13 once %tmp31 is recognised as %tmp10
+  %tmp33 = lshr i32 %tmp32, 2
+  %tmp34 = trunc i32 %tmp33 to i16
+  store i16 %tmp34, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 3), align 2
+  br label %bb35
+
+bb35:                                             ; preds = %bb5, %bb4
+  unreachable
+}
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
new file mode 100644
index 0000000000000..23d66a24724d8
--- /dev/null
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=x86-64 -mcpu=nehalem -asm-verbose=false < %s | FileCheck %s
+
+define <4 x i32> @test_ueq(<4 x float> %in) {
+entry:
+  ; CHECK: pcmpeqd %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp ueq <4 x float> %in, %in              ; x ueq x holds for every x: equal lanes pass, and NaN lanes pass as unordered
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane sign-extends to all-ones
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_uge(<4 x float> %in) {
+entry:
+  ; CHECK: pcmpeqd %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp uge <4 x float> %in, %in              ; x uge x holds for every x: equal lanes pass, and NaN lanes pass as unordered
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane sign-extends to all-ones
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ule(<4 x float> %in) {
+entry:
+  ; CHECK: pcmpeqd %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp ule <4 x float> %in, %in              ; x ule x holds for every x: equal lanes pass, and NaN lanes pass as unordered
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane sign-extends to all-ones
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_one(<4 x float> %in) {
+entry:
+  ; CHECK: xorps %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp one <4 x float> %in, %in              ; x one x never holds: equal lanes fail "not equal", NaN lanes fail "ordered"
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane is zero
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ogt(<4 x float> %in) {
+entry:
+  ; CHECK: xorps %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp ogt <4 x float> %in, %in              ; x ogt x never holds: no value exceeds itself, and NaN lanes fail "ordered"
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane is zero
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_olt(<4 x float> %in) {
+entry:
+  ; CHECK: xorps %xmm0, %xmm0
+  ; CHECK-NEXT: ret
+  %0 = fcmp olt <4 x float> %in, %in              ; x olt x never holds: no value is below itself, and NaN lanes fail "ordered"
+  %1 = sext <4 x i1> %0 to <4 x i32>              ; so every lane is zero
+  ret <4 x i32> %1
+}
diff --git a/test/CodeGen/X86/shift-and.ll b/test/CodeGen/X86/shift-and.ll
index b747cc5580ca9..1de915164f0cd 100644
--- a/test/CodeGen/X86/shift-and.ll
+++ b/test/CodeGen/X86/shift-and.ll
@@ -1,13 +1,27 @@
-; RUN: llc < %s -march=x86    | grep and | count 2
-; RUN: llc < %s -march=x86-64 | not grep and 
+; RUN: llc < %s -mtriple=i386-apple-macosx   | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=X64
 
 define i32 @t1(i32 %t, i32 %val) nounwind {
+; X32: t1:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t1:
+; X64-NOT: andl
+; X64: shll
        %shamt = and i32 %t, 31
        %res = shl i32 %val, %shamt
        ret i32 %res
 }
 
 define i32 @t2(i32 %t, i32 %val) nounwind {
+; X32: t2:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t2:
+; X64-NOT: andl
+; X64: shll
        %shamt = and i32 %t, 63
        %res = shl i32 %val, %shamt
        ret i32 %res
@@ -16,6 +30,13 @@ define i32 @t2(i32 %t, i32 %val) nounwind {
 @X = internal global i16 0
 
 define void @t3(i16 %t) nounwind {
+; X32: t3:
+; X32-NOT: andl
+; X32: sarw
+
+; X64: t3:
+; X64-NOT: andl
+; X64: sarw
        %shamt = and i16 %t, 31
        %tmp = load i16* @X
        %tmp1 = ashr i16 %tmp, %shamt
@@ -24,13 +45,34 @@ define void @t3(i16 %t) nounwind {
 }
 
 define i64 @t4(i64 %t, i64 %val) nounwind {
+; X64: t4:
+; X64-NOT: and
+; X64: shrq
        %shamt = and i64 %t, 63
        %res = lshr i64 %val, %shamt
        ret i64 %res
 }
 
 define i64 @t5(i64 %t, i64 %val) nounwind {
+; X64: t5:
+; X64-NOT: and
+; X64: shrq
        %shamt = and i64 %t, 191
        %res = lshr i64 %val, %shamt
        ret i64 %res
 }
+
+
+; rdar://11866926
+define i64 @t6(i64 %key, i64* nocapture %val) nounwind {
+entry:
+; X64: t6:
+; X64-NOT: movabsq
+; X64: decq
+; X64: andq
+  %shr = lshr i64 %key, 3                         ; the top 3 bits of %shr are always zero
+  %0 = load i64* %val, align 8
+  %sub = add i64 %0, 2305843009213693951          ; constant is 2^61 - 1, i.e. -1 within the low 61 bits that %shr can keep
+  %and = and i64 %sub, %shr                       ; the mask discards the upper bits, so the add can be a decrement with no 64-bit immediate
+  ret i64 %and
+}
diff --git a/test/CodeGen/X86/shift-coalesce.ll b/test/CodeGen/X86/shift-coalesce.ll
index d38f9a88fcd6c..4f27e97fb390b 100644
--- a/test/CodeGen/X86/shift-coalesce.ll
+++ b/test/CodeGen/X86/shift-coalesce.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep {shld.*CL}
+; RUN:   grep "shld.*CL"
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   not grep {mov CL, BL}
+; RUN:   not grep "mov CL, BL"
 
 ; PR687
 
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 5adee7c76941a..8d2b2907c5a76 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN:   grep {sh\[lr\]d} | count 5
+; RUN:   grep "sh[lr]d" | count 5
 
 define i64 @test1(i64 %X, i8 %C) {
         %shift.upgrd.1 = zext i8 %C to i64              ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 3ea601147bb09..c518cdd3aa4ea 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s
 
 define i32* @test1(i32* %P, i32 %X) {
 ; CHECK: test1:
diff --git a/test/CodeGen/X86/shl_elim.ll b/test/CodeGen/X86/shl_elim.ll
index 0827221875b1d..83e1eb5c39e77 100644
--- a/test/CodeGen/X86/shl_elim.ll
+++ b/test/CodeGen/X86/shl_elim.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {movl	8(.esp), %eax}
-; RUN: llc < %s -march=x86 | grep {shrl	.eax}
-; RUN: llc < %s -march=x86 | grep {movswl	.ax, .eax}
+; RUN: llc < %s -march=x86 | grep "movl	8(.esp), %eax"
+; RUN: llc < %s -march=x86 | grep "shrl	.eax"
+; RUN: llc < %s -march=x86 | grep "movswl	.ax, .eax"
 
 define i32 @test1(i64 %a) nounwind {
         %tmp29 = lshr i64 %a, 24                ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 13f932982f14c..1479be1f56ba5 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,8 +1,6 @@
 ; Make sure this testcase codegens to the sin and cos instructions, not calls
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | \
-; RUN:   grep sin\$ | count 3
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | \
-; RUN:   grep cos\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | FileCheck %s --check-prefix=SIN
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | FileCheck %s --check-prefix=COS
 
 declare float  @sinf(float) readonly
 
@@ -10,39 +8,59 @@ declare double @sin(double) readonly
 
 declare x86_fp80 @sinl(x86_fp80) readonly
 
+; SIN: test1:
 define float @test1(float %X) {
         %Y = call float @sinf(float %X) readonly
         ret float %Y
 }
+; SIN: {{^[ \t]*fsin$}}
 
+; SIN-NOT: fsin
+
+; SIN: test2:
 define double @test2(double %X) {
         %Y = call double @sin(double %X) readonly
         ret double %Y
 }
+; SIN: {{^[ \t]*fsin$}}
+
+; SIN-NOT: fsin
 
+; SIN: test3:
 define x86_fp80 @test3(x86_fp80 %X) {
         %Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
         ret x86_fp80 %Y
 }
+; SIN: {{^[ \t]*fsin$}}
 
+; SIN-NOT: fsin
+; COS-NOT: fcos
 declare float @cosf(float) readonly
 
 declare double @cos(double) readonly
 
 declare x86_fp80 @cosl(x86_fp80) readonly
 
+
+; SIN: test4:
+; COS: test4:
 define float @test4(float %X) {
         %Y = call float @cosf(float %X) readonly
         ret float %Y
 }
+; COS: {{^[ \t]*fcos}}
 
 define double @test5(double %X) {
         %Y = call double @cos(double %X) readonly
         ret double %Y
 }
+; COS: {{^[ \t]*fcos}}
 
 define x86_fp80 @test6(x86_fp80 %X) {
         %Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
         ret x86_fp80 %Y
 }
+; COS: {{^[ \t]*fcos}}
 
+; SIN-NOT: fsin
+; COS-NOT: fcos
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 7957eb849673d..649cd61ab78c6 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true | FileCheck %s
 
 ; Currently, floating-point selects are lowered to CFG triangles.
 ; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
new file mode 100644
index 0000000000000..c600f925a32bc
--- /dev/null
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; A MOV32ri is in a loop block that has two successors: one inside the same
+; loop and one outside it. We should be able to sink the MOV32ri out of the
+; loop.
+; rdar://11980766
+define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
+; CHECK: sink_succ
+; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
+; CHECK: %exit
+; CHECK-NOT: movl
+; CHECK: jne [[OUTER_LN1]]
+; CHECK: movl
+; CHECK: [[LN2:LBB0_[0-9]+]]: ## %for.body2
+; CHECK: jne [[LN2]]
+; CHECK: ret
+entry:
+  br label %preheader
+
+preheader:                                        ; outer loop header: %i.127 starts at 0 and the loop is left once %inc9 reaches 10
+  %i.127 = phi i32 [ 0, %entry ], [ %inc9, %exit ]
+  br label %for.body1.lr
+
+for.body1.lr:                                     ; middle loop: %iv30 starts at 1 and the loop is left once %iv.next31 reaches 32
+  %iv30 = phi i32 [ 1, %preheader ], [ %iv.next31, %for.inc40.i ]
+  br label %for.body1
+
+for.body1:                                        ; innermost loop: counts %iv.i up until its truncated successor equals %iv30
+  %iv.i = phi i64 [ 0, %for.body1.lr ], [ %iv.next.i, %for.body1 ]
+  %iv.next.i = add i64 %iv.i, 1
+  %lftr.wideiv32 = trunc i64 %iv.next.i to i32
+  %exitcond33 = icmp eq i32 %lftr.wideiv32, %iv30
+  br i1 %exitcond33, label %for.inc40.i, label %for.body1
+
+for.inc40.i:
+  %iv.next31 = add i32 %iv30, 1
+  %exitcond49.i = icmp eq i32 %iv.next31, 32
+  br i1 %exitcond49.i, label %exit, label %for.body1.lr
+
+exit:                                             ; one successor (preheader) is in the outer loop, the other (for.body2) is outside it
+  %inc9 = add nsw i32 %i.127, 1
+  %exitcond34 = icmp eq i32 %inc9, 10
+  br i1 %exitcond34, label %for.body2, label %preheader
+
+for.body2:                                        ; separate loop after the nest: %iv starts at 0 and the loop ends when %iv.next == 2048
+  %iv = phi i64 [ %iv.next, %for.body2 ], [ 0, %exit ]
+  %iv.next = add i64 %iv, 1
+  %lftr.wideiv = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 2048
+  br i1 %exitcond, label %for.end20, label %for.body2
+
+for.end20:
+  ret i32 0
+}
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
index 81a072fb396a7..980f18c8b9115 100644
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ b/test/CodeGen/X86/splat-scalar-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
 ; rdar://7434544
 
 define <2 x i64> @t2() nounwind {
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 118e393b7baad..71a42f4db34ab 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
 
 ; CHECK: a:
 ; CHECK: movdqu
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index d1e07c8563640..c99287bdfb9f5 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mcpu=nehalem | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7"
 
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 11124409f058a..3839e875615f6 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -promote-elements | FileCheck %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false  | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math  | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math  | FileCheck -check-prefix=FINITE %s
 
 ; Some of these patterns can be matched as SSE min or max. Some of
-; then can be matched provided that the operands are swapped.
+; them can be matched provided that the operands are swapped.
@@ -8,13 +8,10 @@
 ; and a conditional branch.
 
-; The naming convention is {,x_,y_}{o,u}{gt,lt,ge,le}{,_inverse}
+; The naming convention is {o,u}{gt,lt,ge,le}{,_inverse}{,_x,_y}
-; x_ : use 0.0 instead of %y
-; y_ : use -0.0 instead of %y
+;  _x: use 0.0 instead of %y
+;  _y: use -0.0 instead of %y
 ; _inverse : swap the arms of the select.
 
-; Some of these tests depend on -join-physregs commuting instructions to
-; eliminate copies.
-
 ; CHECK:      ogt:
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
@@ -139,147 +136,147 @@ define double @ole_inverse(double %x, double %y) nounwind {
   ret double %d
 }
 
-; CHECK:      x_ogt:
+; CHECK:      ogt_x:
 ; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_ogt:
+; UNSAFE:      ogt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ogt:
+; FINITE:      ogt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; FINITE-NEXT: maxsd %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ogt(double %x) nounwind {
+define double @ogt_x(double %x) nounwind {
   %c = fcmp ogt double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_olt:
+; CHECK:      olt_x:
 ; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; CHECK-NEXT: minsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_olt:
+; UNSAFE:      olt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_olt:
+; FINITE:      olt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; FINITE-NEXT: minsd %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_olt(double %x) nounwind {
+define double @olt_x(double %x) nounwind {
   %c = fcmp olt double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_ogt_inverse:
+; CHECK:      ogt_inverse_x:
 ; CHECK-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; CHECK-NEXT: minsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_ogt_inverse:
+; UNSAFE:      ogt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd  %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ogt_inverse:
+; FINITE:      ogt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; FINITE-NEXT: minsd  %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_x(double %x) nounwind {
   %c = fcmp ogt double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_olt_inverse:
+; CHECK:      olt_inverse_x:
 ; CHECK-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; CHECK-NEXT: maxsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_olt_inverse:
+; UNSAFE:      olt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_olt_inverse:
+; FINITE:      olt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; FINITE-NEXT: maxsd  %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_olt_inverse(double %x) nounwind {
+define double @olt_inverse_x(double %x) nounwind {
   %c = fcmp olt double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_oge:
+; CHECK:      oge_x:
 ; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      x_oge:
+; UNSAFE:      oge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_oge:
+; FINITE:      oge_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: maxsd   %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_oge(double %x) nounwind {
+define double @oge_x(double %x) nounwind {
   %c = fcmp oge double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_ole:
+; CHECK:      ole_x:
 ; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      x_ole:
+; UNSAFE:      ole_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ole:
+; FINITE:      ole_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; FINITE-NEXT: minsd %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ole(double %x) nounwind {
+define double @ole_x(double %x) nounwind {
   %c = fcmp ole double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_oge_inverse:
-; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      x_oge_inverse:
+; CHECK:      oge_inverse_x:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      oge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_oge_inverse:
+; FINITE:      oge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: minsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_oge_inverse(double %x) nounwind {
+define double @oge_inverse_x(double %x) nounwind {
   %c = fcmp oge double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_ole_inverse:
-; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      x_ole_inverse:
+; CHECK:      ole_inverse_x:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ole_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ole_inverse:
+; FINITE:      ole_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: maxsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ole_inverse(double %x) nounwind {
+define double @ole_inverse_x(double %x) nounwind {
   %c = fcmp ole double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
@@ -411,419 +408,419 @@ define double @ule_inverse(double %x, double %y) nounwind {
   ret double %d
 }
 
-; CHECK:      x_ugt:
+; CHECK:      ugt_x:
 ; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      x_ugt:
+; UNSAFE:      ugt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ugt:
+; FINITE:      ugt_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: maxsd   %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ugt(double %x) nounwind {
+define double @ugt_x(double %x) nounwind {
   %c = fcmp ugt double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_ult:
+; CHECK:      ult_x:
 ; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      x_ult:
+; UNSAFE:      ult_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ult:
+; FINITE:      ult_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: minsd   %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ult(double %x) nounwind {
+define double @ult_x(double %x) nounwind {
   %c = fcmp ult double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_ugt_inverse:
-; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      x_ugt_inverse:
+; CHECK:      ugt_inverse_x:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ugt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ugt_inverse:
+; FINITE:      ugt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: minsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_x(double %x) nounwind {
   %c = fcmp ugt double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_ult_inverse:
-; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      x_ult_inverse:
+; CHECK:      ult_inverse_x:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ult_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ult_inverse:
+; FINITE:      ult_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
 ; FINITE-NEXT: maxsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ult_inverse(double %x) nounwind {
+define double @ult_inverse_x(double %x) nounwind {
   %c = fcmp ult double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_uge:
+; CHECK:      uge_x:
 ; CHECK-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; CHECK-NEXT: maxsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_uge:
+; UNSAFE:      uge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_uge:
+; FINITE:      uge_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; FINITE-NEXT: maxsd  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_uge(double %x) nounwind {
+define double @uge_x(double %x) nounwind {
   %c = fcmp uge double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_ule:
+; CHECK:      ule_x:
 ; CHECK-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; CHECK-NEXT: minsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_ule:
+; UNSAFE:      ule_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ule:
+; FINITE:      ule_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
 ; FINITE-NEXT: minsd  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ule(double %x) nounwind {
+define double @ule_x(double %x) nounwind {
   %c = fcmp ule double %x, 0.000000e+00
   %d = select i1 %c, double %x, double 0.000000e+00
   ret double %d
 }
 
-; CHECK:      x_uge_inverse:
+; CHECK:      uge_inverse_x:
 ; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; CHECK-NEXT: minsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_uge_inverse:
+; UNSAFE:      uge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; UNSAFE-NEXT: minsd %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_uge_inverse:
+; FINITE:      uge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; FINITE-NEXT: minsd %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_uge_inverse(double %x) nounwind {
+define double @uge_inverse_x(double %x) nounwind {
   %c = fcmp uge double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      x_ule_inverse:
+; CHECK:      ule_inverse_x:
 ; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      x_ule_inverse:
+; UNSAFE:      ule_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; UNSAFE-NEXT: maxsd %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      x_ule_inverse:
+; FINITE:      ule_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
 ; FINITE-NEXT: maxsd %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @x_ule_inverse(double %x) nounwind {
+define double @ule_inverse_x(double %x) nounwind {
   %c = fcmp ule double %x, 0.000000e+00
   %d = select i1 %c, double 0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_ogt:
+; CHECK:      ogt_y:
 ; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_ogt:
+; UNSAFE:      ogt_y:
 ; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ogt:
+; FINITE:      ogt_y:
 ; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ogt(double %x) nounwind {
+define double @ogt_y(double %x) nounwind {
   %c = fcmp ogt double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_olt:
+; CHECK:      olt_y:
 ; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_olt:
+; UNSAFE:      olt_y:
 ; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_olt:
+; FINITE:      olt_y:
 ; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_olt(double %x) nounwind {
+define double @olt_y(double %x) nounwind {
   %c = fcmp olt double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_ogt_inverse:
+; CHECK:      ogt_inverse_y:
 ; CHECK-NEXT: movsd  {{[^,]*}}, %xmm1
 ; CHECK-NEXT: minsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_ogt_inverse:
+; UNSAFE:      ogt_inverse_y:
 ; UNSAFE-NEXT: movsd  {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: minsd  %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ogt_inverse:
+; FINITE:      ogt_inverse_y:
 ; FINITE-NEXT: movsd  {{[^,]*}}, %xmm1
 ; FINITE-NEXT: minsd  %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_y(double %x) nounwind {
   %c = fcmp ogt double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_olt_inverse:
+; CHECK:      olt_inverse_y:
 ; CHECK-NEXT: movsd  {{[^,]*}}, %xmm1
 ; CHECK-NEXT: maxsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_olt_inverse:
+; UNSAFE:      olt_inverse_y:
 ; UNSAFE-NEXT: movsd  {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_olt_inverse:
+; FINITE:      olt_inverse_y:
 ; FINITE-NEXT: movsd  {{[^,]*}}, %xmm1
 ; FINITE-NEXT: maxsd  %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_olt_inverse(double %x) nounwind {
+define double @olt_inverse_y(double %x) nounwind {
   %c = fcmp olt double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_oge:
+; CHECK:      oge_y:
 ; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      y_oge:
+; UNSAFE:      oge_y:
 ; UNSAFE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_oge:
+; FINITE:      oge_y:
 ; FINITE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_oge(double %x) nounwind {
+define double @oge_y(double %x) nounwind {
   %c = fcmp oge double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_ole:
+; CHECK:      ole_y:
 ; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      y_ole:
+; UNSAFE:      ole_y:
 ; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ole:
+; FINITE:      ole_y:
 ; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ole(double %x) nounwind {
+define double @ole_y(double %x) nounwind {
   %c = fcmp ole double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_oge_inverse:
-; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      y_oge_inverse:
+; CHECK:      oge_inverse_y:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      oge_inverse_y:
 ; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: minsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_oge_inverse:
+; FINITE:      oge_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; FINITE-NEXT: minsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_oge_inverse(double %x) nounwind {
+define double @oge_inverse_y(double %x) nounwind {
   %c = fcmp oge double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_ole_inverse:
-; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      y_ole_inverse:
+; CHECK:      ole_inverse_y:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ole_inverse_y:
 ; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ole_inverse:
+; FINITE:      ole_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; FINITE-NEXT: maxsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ole_inverse(double %x) nounwind {
+define double @ole_inverse_y(double %x) nounwind {
   %c = fcmp ole double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_ugt:
+; CHECK:      ugt_y:
 ; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      y_ugt:
+; UNSAFE:      ugt_y:
 ; UNSAFE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ugt:
+; FINITE:      ugt_y:
 ; FINITE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ugt(double %x) nounwind {
+define double @ugt_y(double %x) nounwind {
   %c = fcmp ugt double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_ult:
+; CHECK:      ult_y:
 ; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      y_ult:
+; UNSAFE:      ult_y:
 ; UNSAFE-NEXT: minsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ult:
+; FINITE:      ult_y:
 ; FINITE-NEXT: minsd   {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ult(double %x) nounwind {
+define double @ult_y(double %x) nounwind {
   %c = fcmp ult double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_ugt_inverse:
-; CHECK:      ucomisd %xmm0, %xmm1
-; UNSAFE:      y_ugt_inverse:
+; CHECK:      ugt_inverse_y:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ugt_inverse_y:
 ; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: minsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ugt_inverse:
+; FINITE:      ugt_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; FINITE-NEXT: minsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_y(double %x) nounwind {
   %c = fcmp ugt double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_ult_inverse:
-; CHECK:      ucomisd %xmm1, %xmm0
-; UNSAFE:      y_ult_inverse:
+; CHECK:      ult_inverse_y:
+; CHECK:      ucomisd %xmm
+; UNSAFE:      ult_inverse_y:
 ; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ult_inverse:
+; FINITE:      ult_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
 ; FINITE-NEXT: maxsd   %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ult_inverse(double %x) nounwind {
+define double @ult_inverse_y(double %x) nounwind {
   %c = fcmp ult double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_uge:
+; CHECK:      uge_y:
 ; CHECK-NEXT: movsd  {{[^,]*}}, %xmm1
 ; CHECK-NEXT: maxsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_uge:
+; UNSAFE:      uge_y:
 ; UNSAFE-NEXT: maxsd  {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_uge:
+; FINITE:      uge_y:
 ; FINITE-NEXT: maxsd  {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_uge(double %x) nounwind {
+define double @uge_y(double %x) nounwind {
   %c = fcmp uge double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_ule:
+; CHECK:      ule_y:
 ; CHECK-NEXT: movsd  {{[^,]*}}, %xmm1
 ; CHECK-NEXT: minsd  %xmm0, %xmm1
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_ule:
+; UNSAFE:      ule_y:
 ; UNSAFE-NEXT: minsd  {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ule:
+; FINITE:      ule_y:
 ; FINITE-NEXT: minsd  {{[^,]*}}, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ule(double %x) nounwind {
+define double @ule_y(double %x) nounwind {
   %c = fcmp ule double %x, -0.000000e+00
   %d = select i1 %c, double %x, double -0.000000e+00
   ret double %d
 }
 
-; CHECK:      y_uge_inverse:
+; CHECK:      uge_inverse_y:
 ; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_uge_inverse:
+; UNSAFE:      uge_inverse_y:
 ; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: minsd %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_uge_inverse:
+; FINITE:      uge_inverse_y:
 ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
 ; FINITE-NEXT: minsd %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_uge_inverse(double %x) nounwind {
+define double @uge_inverse_y(double %x) nounwind {
   %c = fcmp uge double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
 }
 
-; CHECK:      y_ule_inverse:
+; CHECK:      ule_inverse_y:
 ; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
-; UNSAFE:      y_ule_inverse:
+; UNSAFE:      ule_inverse_y:
 ; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
 ; UNSAFE-NEXT: maxsd %xmm0, %xmm1
 ; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
-; FINITE:      y_ule_inverse:
+; FINITE:      ule_inverse_y:
 ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
 ; FINITE-NEXT: maxsd %xmm0, %xmm1
 ; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; FINITE-NEXT: ret
-define double @y_ule_inverse(double %x) nounwind {
+define double @ule_inverse_y(double %x) nounwind {
   %c = fcmp ule double %x, -0.000000e+00
   %d = select i1 %c, double -0.000000e+00, double %x
   ret double %d
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 5ea1b4dff1c15..48638b3b696c6 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -249,9 +249,10 @@ entry:
 ; X64: 	t16:
 ; X64: 		pextrw	$8, %xmm0, %eax
 ; X64: 		pslldq	$2, %xmm0
-; X64: 		movd	%xmm0, %ecx
-; X64: 		pextrw	$1, %xmm0, %edx
-; X64: 		pinsrw	$0, %ecx, %xmm0
+; X64: 		pextrw	$1, %xmm0, %ecx
+; X64: 		movzbl	%cl, %ecx
+; X64: 		orl	%eax, %ecx
+; X64: 		pinsrw	$1, %ecx, %xmm0
 ; X64: 		ret
 }
 
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 1a1017d2c1762..a2a0debf9e957 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
 
 ;CHECK: vsel_float
 ;CHECK: blendvps
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 54264b16aea0c..c6f9f0c873af4 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X64
 
 @g16 = external global i16
 
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
new file mode 100644
index 0000000000000..076e213364929
--- /dev/null
+++ b/test/CodeGen/X86/sse4a.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
+
+define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; CHECK: test1:
+; CHECK: movntss
+  tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+  ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test2(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; CHECK: test2:
+; CHECK: movntsd
+  tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+  ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define <2 x i64> @test3(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK: test3:
+; CHECK: extrq
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
+  ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test4:
+; CHECK: extrq
+  %1 = bitcast <2 x i64> %y to <16 x i8>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+  ret <2 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test5:
+; CHECK: insertq
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
+  ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test6(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test6:
+; CHECK: insertq
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
diff --git a/test/CodeGen/X86/sse_reload_fold.ll b/test/CodeGen/X86/sse_reload_fold.ll
index a57fa588f054a..fd8db3be10639 100644
--- a/test/CodeGen/X86/sse_reload_fold.ll
+++ b/test/CodeGen/X86/sse_reload_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic |& FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
 ; CHECK: fail
 ; CHECK-NOT: fail
 
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index f6c13ec0adf72..0ddb2378ef2f2 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -10,11 +10,11 @@ target triple = "i686-apple-darwin8"
 define void @test({ double, double }* byval  %z, double* %P) nounwind {
 entry:
 	%tmp3 = load double* @G, align 16		; <double> [#uses=1]
-	%tmp4 = tail call double @fabs( double %tmp3 )		; <double> [#uses=1]
+	%tmp4 = tail call double @fabs( double %tmp3 ) readnone	; <double> [#uses=1]
         store volatile double %tmp4, double* %P
 	%tmp = getelementptr { double, double }* %z, i32 0, i32 0		; <double*> [#uses=1]
 	%tmp1 = load volatile double* %tmp, align 8		; <double> [#uses=1]
-	%tmp2 = tail call double @fabs( double %tmp1 )		; <double> [#uses=1]
+	%tmp2 = tail call double @fabs( double %tmp1 ) readnone	; <double> [#uses=1]
     ; CHECK: andpd{{.*}}4(%esp), %xmm
 	%tmp6 = fadd double %tmp4, %tmp2		; <double> [#uses=1]
 	store volatile double %tmp6, double* %P, align 8
diff --git a/test/CodeGen/X86/stack-protector-linux.ll b/test/CodeGen/X86/stack-protector.ll
index fe2a9c5d57a1e..c07511443bceb 100644
--- a/test/CodeGen/X86/stack-protector-linux.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | grep %gs:
 ; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %fs:
 ; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %gs:
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_guard}
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_fail}
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_guard"
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_fail"
 
 @"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00"		; <[11 x i8]*> [#uses=1]
 
diff --git a/test/CodeGen/X86/store_op_load_fold2.ll b/test/CodeGen/X86/store_op_load_fold2.ll
index 8313166a90cc3..6e4fe90053f10 100644
--- a/test/CodeGen/X86/store_op_load_fold2.ll
+++ b/test/CodeGen/X86/store_op_load_fold2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
 
 target datalayout = "e-p:32:32"
         %struct.Macroblock = type { i32, i32, i32, i32, i32, [8 x i32], %struct.Macroblock*, %struct.Macroblock*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/subreg-to-reg-1.ll b/test/CodeGen/X86/subreg-to-reg-1.ll
index a297728aee897..4f31ab5a92291 100644
--- a/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {leal	.*), %e.\*} | count 1
+; RUN: llc < %s -march=x86-64 | grep "leal	.*), %e.*" | count 1
 
 ; Don't eliminate or coalesce away the explicit zero-extension!
 ; This is currently using an leal because of a 3-addressification detail,
diff --git a/test/CodeGen/X86/subreg-to-reg-4.ll b/test/CodeGen/X86/subreg-to-reg-4.ll
index 0ea5541c89dca..0693789fe5d54 100644
--- a/test/CodeGen/X86/subreg-to-reg-4.ll
+++ b/test/CodeGen/X86/subreg-to-reg-4.ll
@@ -5,7 +5,7 @@
 ; RUN: not grep negq %t
 ; RUN: not grep addq %t
 ; RUN: not grep subq %t
-; RUN: not grep {movl	%} %t
+; RUN: not grep "movl	%" %t
 
 ; Utilize implicit zero-extension on x86-64 to eliminate explicit
 ; zero-extensions. Shrink 64-bit adds to 32-bit when the high
diff --git a/test/CodeGen/X86/switch-order-weight.ll b/test/CodeGen/X86/switch-order-weight.ll
new file mode 100644
index 0000000000000..0fdd56d4e1d3b
--- /dev/null
+++ b/test/CodeGen/X86/switch-order-weight.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-apple-darwin11 < %s | FileCheck %s
+
+; Check that the cases which lead to unreachable are checked after "10"
+
+define void @test1(i32 %x) nounwind uwtable ssp {
+entry:
+  switch i32 %x, label %if.end7 [
+    i32 0, label %if.then
+    i32 10, label %if.then2
+    i32 20, label %if.then5
+  ]
+
+; CHECK: test1:
+; CHECK-NOT: unr
+; CHECK: cmpl $10
+; CHECK: bar
+; CHECK: cmpl $20
+
+if.then:
+  tail call void @unr(i32 23) noreturn nounwind
+  unreachable
+
+if.then2:
+  tail call void @bar(i32 42) nounwind
+  br label %if.end7
+
+if.then5:
+  tail call void @unr(i32 5) noreturn nounwind
+  unreachable
+
+if.end7:
+  ret void
+}
+
+declare void @unr(i32) noreturn
+
+declare void @bar(i32)
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
new file mode 100644
index 0000000000000..70307534156e0
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.4.0"
+
+declare i64 @testi()
+
+define i64 @test_trivial() {
+ %A = tail call i64 @testi()
+ ret i64 %A
+}
+; CHECK: test_trivial:
+; CHECK: jmp	_testi                  ## TAILCALL
+
+
+define i64 @test_noop_bitcast() {
+ %A = tail call i64 @testi()
+ %B = bitcast i64 %A to i64
+ ret i64 %B
+}
+; CHECK: test_noop_bitcast:
+; CHECK: jmp	_testi                  ## TAILCALL
+
+
+; Tail call shouldn't be blocked by no-op inttoptr.
+define i8* @test_inttoptr() {
+  %A = tail call i64 @testi()
+  %B = inttoptr i64 %A to i8*
+  ret i8* %B
+}
+
+; CHECK: test_inttoptr:
+; CHECK: jmp	_testi                  ## TAILCALL
+
+
+declare <4 x float> @testv()
+
+define <4 x i32> @test_vectorbitcast() {
+  %A = tail call <4 x float> @testv()
+  %B = bitcast <4 x float> %A to <4 x i32>
+  ret <4 x i32> %B
+}
+; CHECK: test_vectorbitcast:
+; CHECK: jmp	_testv                  ## TAILCALL
+
+
+declare { i64, i64 } @testp()
+
+define {i64, i64} @test_pair_trivial() {
+  %A = tail call { i64, i64} @testp()
+  ret { i64, i64} %A
+}
+; CHECK: test_pair_trivial:
+; CHECK: jmp	_testp                  ## TAILCALL
+
+
+
+define {i64, i64} @test_pair_trivial_extract() {
+  %A = tail call { i64, i64} @testp()
+  %x = extractvalue { i64, i64} %A, 0
+  %y = extractvalue { i64, i64} %A, 1
+  
+  %b = insertvalue {i64, i64} undef, i64 %x, 0
+  %c = insertvalue {i64, i64} %b, i64 %y, 1
+  
+  ret { i64, i64} %c
+}
+
+; CHECK: test_pair_trivial_extract:
+; CHECK: jmp	_testp                  ## TAILCALL
+
+define {i8*, i64} @test_pair_conv_extract() {
+  %A = tail call { i64, i64} @testp()
+  %x = extractvalue { i64, i64} %A, 0
+  %y = extractvalue { i64, i64} %A, 1
+  
+  %x1 = inttoptr i64 %x to i8*
+  
+  %b = insertvalue {i8*, i64} undef, i8* %x1, 0
+  %c = insertvalue {i8*, i64} %b, i64 %y, 1
+  
+  ret { i8*, i64} %c
+}
+
+; CHECK: test_pair_conv_extract:
+; CHECK: jmp	_testp                  ## TAILCALL
+
+
+
+; PR13006
+define { i64, i64 } @crash(i8* %this) {
+  %c = tail call { i64, i64 } @testp()
+  %mrv7 = insertvalue { i64, i64 } %c, i64 undef, 1
+  ret { i64, i64 } %mrv7
+}
+
+
diff --git a/test/CodeGen/X86/tailcall-cgp-dup.ll b/test/CodeGen/X86/tailcall-cgp-dup.ll
new file mode 100644
index 0000000000000..a80b90f9eee2c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; CGP (CodeGenPrepare) duplicates returns to enable tail call optimization.
+; rdar://9147433
+
+define i32 @foo(i32 %x) nounwind ssp {
+; CHECK: foo:
+entry:
+  switch i32 %x, label %return [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb1
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb5
+    i32 5, label %sw.bb7
+    i32 6, label %sw.bb9
+  ]
+
+sw.bb:                                            ; preds = %entry
+; CHECK: jmp _f1
+  %call = tail call i32 @f1() nounwind
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+; CHECK: jmp _f2
+  %call2 = tail call i32 @f2() nounwind
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+; CHECK: jmp _f3
+  %call4 = tail call i32 @f3() nounwind
+  br label %return
+
+sw.bb5:                                           ; preds = %entry
+; CHECK: jmp _f4
+  %call6 = tail call i32 @f4() nounwind
+  br label %return
+
+sw.bb7:                                           ; preds = %entry
+; CHECK: jmp _f5
+  %call8 = tail call i32 @f5() nounwind
+  br label %return
+
+sw.bb9:                                           ; preds = %entry
+; CHECK: jmp _f6
+  %call10 = tail call i32 @f6() nounwind
+  br label %return
+
+return:                                           ; preds = %entry, %sw.bb9, %sw.bb7, %sw.bb5, %sw.bb3, %sw.bb1, %sw.bb
+  %retval.0 = phi i32 [ %call10, %sw.bb9 ], [ %call8, %sw.bb7 ], [ %call6, %sw.bb5 ], [ %call4, %sw.bb3 ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @f1()
+
+declare i32 @f2()
+
+declare i32 @f3()
+
+declare i32 @f4()
+
+declare i32 @f5()
+
+declare i32 @f6()
+
+; rdar://11958338
+%0 = type opaque
+
+declare i8* @bar(i8*) uwtable optsize noinline ssp
+
+define hidden %0* @thingWithValue(i8* %self) uwtable ssp {
+entry:
+; CHECK: thingWithValue:
+; CHECK: jmp _bar
+  br i1 undef, label %if.then.i, label %if.else.i
+
+if.then.i:                                        ; preds = %entry
+  br label %someThingWithValue.exit
+
+if.else.i:                                        ; preds = %entry
+  %call4.i = tail call i8* @bar(i8* undef) optsize
+  br label %someThingWithValue.exit
+
+someThingWithValue.exit:                          ; preds = %if.else.i, %if.then.i
+  %retval.0.in.i = phi i8* [ undef, %if.then.i ], [ %call4.i, %if.else.i ]
+  %retval.0.i = bitcast i8* %retval.0.in.i to %0*
+  ret %0* %retval.0.i
+}
diff --git a/test/CodeGen/X86/tailcall-i1.ll b/test/CodeGen/X86/tailcall-i1.ll
deleted file mode 100644
index 8ef1f11383be9..0000000000000
--- a/test/CodeGen/X86/tailcall-i1.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc i1 @i1test(i32, i32, i32, i32) {
-  entry:
-  %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
-  ret i1 %4
-}
diff --git a/test/CodeGen/X86/tailcall-largecode.ll b/test/CodeGen/X86/tailcall-largecode.ll
index c3f4278aecbe5..e9b8721e6608a 100644
--- a/test/CodeGen/X86/tailcall-largecode.ll
+++ b/test/CodeGen/X86/tailcall-largecode.ll
@@ -49,6 +49,11 @@ define fastcc i32 @direct_manyargs() {
 ;  CHECK: pushq
 ; Pass the stack argument.
 ;  CHECK: movl $7, 16(%rsp)
+; This is the large code model, so &manyargs_callee may not fit into
+; the jmp instruction.  Put it into a register which won't be clobbered
+; while restoring callee-saved registers and won't be used for passing
+; arguments.
+;  CHECK: movabsq $manyargs_callee, %rax
 ; Pass the register arguments, in the right registers.
 ;  CHECK: movl $1, %edi
 ;  CHECK: movl $2, %esi
@@ -56,11 +61,6 @@ define fastcc i32 @direct_manyargs() {
 ;  CHECK: movl $4, %ecx
 ;  CHECK: movl $5, %r8d
 ;  CHECK: movl $6, %r9d
-; This is the large code model, so &manyargs_callee may not fit into
-; the jmp instruction.  Put it into R11, which won't be clobbered
-; while restoring callee-saved registers and won't be used for passing
-; arguments.
-;  CHECK: movabsq $manyargs_callee, %rax
 ; Adjust the stack to "return".
 ;  CHECK: popq
 ; And tail-call to the target.
diff --git a/test/CodeGen/X86/tailcall-void.ll b/test/CodeGen/X86/tailcall-void.ll
deleted file mode 100644
index 4e578d1b6410d..0000000000000
--- a/test/CodeGen/X86/tailcall-void.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc void @i1test(i32, i32, i32, i32) {
-  entry:
-   tail call fastcc void @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
-   ret void 
-}
diff --git a/test/CodeGen/X86/tailcall1.ll b/test/CodeGen/X86/tailcall.ll
index f7ff5d5308d66..36a38e0b69d06 100644
--- a/test/CodeGen/X86/tailcall1.ll
+++ b/test/CodeGen/X86/tailcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 5
+; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 7
 
 ; With -tailcallopt, CodeGen guarantees a tail call optimization
 ; for all of these.
@@ -38,3 +38,15 @@ define fastcc i32 @noret() nounwind {
   tail call fastcc void @does_not_return()
   unreachable
 }
+
+define fastcc void @void_test(i32, i32, i32, i32) {
+  entry:
+   tail call fastcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+   ret void 
+}
+
+define fastcc i1 @i1test(i32, i32, i32, i32) {
+  entry:
+  %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+  ret i1 %4
+}
diff --git a/test/CodeGen/X86/tailcallbyval.ll b/test/CodeGen/X86/tailcallbyval.ll
index 03d6f9411e68e..118eee6ba6cd1 100644
--- a/test/CodeGen/X86/tailcallbyval.ll
+++ b/test/CodeGen/X86/tailcallbyval.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-; RUN: llc < %s -march=x86 -tailcallopt | grep {movl\[\[:space:\]\]*4(%esp), %eax} | count 1
+; RUN: llc < %s -march=x86 -tailcallopt | grep "movl[[:space:]]*4(%esp), %eax" | count 1
 %struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32,
                   i32, i32, i32, i32, i32, i32, i32, i32,
                   i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
new file mode 100644
index 0000000000000..ba5f8f83619fa
--- /dev/null
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+
+; Gather non-machine specific tests for the transformations in
+; CodeGen/SelectionDAG/TargetLowering.  Currently, these
+; can't be tested easily by checking the SDNodes that are
+; the data structures that these transformations act on.
+; Therefore, use X86 assembler output to check against.
+
+; rdar://11195364 A problem with the transformation:
+;  If all of the demanded bits on one side are known, and all of the set
+;  bits on that side are also known to be set on the other side, turn this
+;  into an AND, as we know the bits will be cleared.
+; The known set (one) bits for the arguments of %xor1 are not the same, so the
+; transformation should not occur.
+define void @foo(i32 %i32In1, i32 %i32In2, i32 %i32In3, i32 %i32In4, 
+                 i32 %i32In5, i32 %i32In6, i32* %i32StarOut, i1 %i1In1, 
+                 i32* %i32SelOut) nounwind {
+    %and3 = and i32 %i32In1, 1362779777
+    %or2 = or i32 %i32In2, %i32In3
+    %and2 = and i32 %or2, 1362779777
+    %xor3 = xor i32 %and3, %and2
+    ; CHECK: shll
+    %shl1 = shl i32 %xor3, %i32In4
+    %sub1 = sub i32 %or2, %shl1
+    %add1 = add i32 %sub1, %i32In5
+    %and1 = and i32 %add1, 1
+    %xor2 = xor i32 %and1, 1
+    %or1 = or i32 %xor2, 364806994 ;0x15BE8352
+    ; CHECK-NOT: andl $96239955
+    %xor1 = xor i32 %or1, 268567040 ;0x10020200
+    ; force an output so not DCE'd
+    store i32 %xor1, i32* %i32StarOut
+    ; force not fast isel by using a select
+    %i32SelVal = select i1 %i1In1, i32 %i32In1, i32 %xor1
+    store i32 %i32SelVal, i32* %i32SelOut
+    ; CHECK: ret
+    ret void
+}
diff --git a/test/CodeGen/X86/thiscall-struct-return.ll b/test/CodeGen/X86/thiscall-struct-return.ll
index a7be48355f697..0507cb890cd23 100644
--- a/test/CodeGen/X86/thiscall-struct-return.ll
+++ b/test/CodeGen/X86/thiscall-struct-return.ll
@@ -10,7 +10,7 @@ declare x86_thiscallcc void @_ZNK1C6MediumEv(%struct.M* noalias sret %agg.result
 
 define void @testv() nounwind {
 ; CHECK: testv:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
 ; CHECK-NEXT: movl	%esi, (%esp)
 ; CHECK-NEXT: calll _ZN1CC1Ev
 ; CHECK: leal 8(%esp), %eax
@@ -29,7 +29,7 @@ entry:
 
 define void @test2v() nounwind {
 ; CHECK: test2v:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
 ; CHECK-NEXT: movl	%esi, (%esp)
 ; CHECK-NEXT: calll _ZN1CC1Ev
 ; CHECK: leal 8(%esp), %eax
diff --git a/test/CodeGen/X86/tls-local-dynamic.ll b/test/CodeGen/X86/tls-local-dynamic.ll
new file mode 100644
index 0000000000000..c5fd16bbec22f
--- /dev/null
+++ b/test/CodeGen/X86/tls-local-dynamic.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck  %s
+
+@x = internal thread_local global i32 0, align 4
+@y = internal thread_local global i32 0, align 4
+
+; get_x and get_y are here to prevent x and y from being optimized away as 0
+
+define i32* @get_x() {
+entry:
+  ret i32* @x
+; FIXME: This function uses a single thread-local variable,
+; so we might want to fall back to general-dynamic here.
+; CHECK:       get_x:
+; CHECK:       leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT:  callq __tls_get_addr@PLT
+; CHECK:       x@DTPOFF
+}
+
+define i32* @get_y() {
+entry:
+  ret i32* @y
+}
+
+define i32 @f(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 1
+  br i1 %cmp, label %return, label %if.else
+; This bb does not access TLS, so should not call __tls_get_addr.
+; CHECK:       f:
+; CHECK-NOT:   __tls_get_addr
+; CHECK:       je
+
+
+if.else:
+  %0 = load i32* @x, align 4
+  %cmp1 = icmp eq i32 %i, 2
+  br i1 %cmp1, label %if.then2, label %return
+; Now we call __tls_get_addr.
+; CHECK:       # %if.else
+; CHECK:       leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT:  callq __tls_get_addr@PLT
+; CHECK:       x@DTPOFF
+
+
+if.then2:
+  %1 = load i32* @y, align 4
+  %add = add nsw i32 %1, %0
+  br label %return
+; This accesses TLS, but is dominated by the previous block,
+; so should not have to call __tls_get_addr again.
+; CHECK:       # %if.then2
+; CHECK-NOT:   __tls_get_addr
+; CHECK:       y@DTPOFF
+
+
+return:
+  %retval.0 = phi i32 [ %add, %if.then2 ], [ 5, %entry ], [ %0, %if.else ]
+  ret i32 %retval.0
+}
diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll
new file mode 100644
index 0000000000000..7c527e210a90a
--- /dev/null
+++ b/test/CodeGen/X86/tls-models.ll
@@ -0,0 +1,166 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64_PIC %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32_PIC %s
+
+; Darwin always uses the same model.
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+  ret i32* @external_gd
+
+  ; Non-PIC code can use initial-exec, PIC code has to use general dynamic.
+  ; X64:     f1:
+  ; X64:     external_gd@GOTTPOFF
+  ; X32:     f1:
+  ; X32:     external_gd@INDNTPOFF
+  ; X64_PIC: f1:
+  ; X64_PIC: external_gd@TLSGD
+  ; X32_PIC: f1:
+  ; X32_PIC: external_gd@TLSGD
+  ; DARWIN:  f1:
+  ; DARWIN:  _external_gd@TLVP
+}
+
+define i32* @f2() {
+entry:
+  ret i32* @internal_gd
+
+  ; Non-PIC code can use local exec, PIC code can use local dynamic.
+  ; X64:     f2:
+  ; X64:     internal_gd@TPOFF
+  ; X32:     f2:
+  ; X32:     internal_gd@NTPOFF
+  ; X64_PIC: f2:
+  ; X64_PIC: internal_gd@TLSLD
+  ; X32_PIC: f2:
+  ; X32_PIC: internal_gd@TLSLDM
+  ; DARWIN:  f2:
+  ; DARWIN:  _internal_gd@TLVP
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+  ret i32* @external_ld
+
+  ; Non-PIC code can use initial exec, PIC code uses local dynamic as specified.
+  ; X64:     f3:
+  ; X64:     external_ld@GOTTPOFF
+  ; X32:     f3:
+  ; X32:     external_ld@INDNTPOFF
+  ; X64_PIC: f3:
+  ; X64_PIC: external_ld@TLSLD
+  ; X32_PIC: f3:
+  ; X32_PIC: external_ld@TLSLDM
+  ; DARWIN:  f3:
+  ; DARWIN:  _external_ld@TLVP
+}
+
+define i32* @f4() {
+entry:
+  ret i32* @internal_ld
+
+  ; Non-PIC code can use local exec, PIC code can use local dynamic.
+  ; X64:     f4:
+  ; X64:     internal_ld@TPOFF
+  ; X32:     f4:
+  ; X32:     internal_ld@NTPOFF
+  ; X64_PIC: f4:
+  ; X64_PIC: internal_ld@TLSLD
+  ; X32_PIC: f4:
+  ; X32_PIC: internal_ld@TLSLDM
+  ; DARWIN:  f4:
+  ; DARWIN:  _internal_ld@TLVP
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+  ret i32* @external_ie
+
+  ; Non-PIC and PIC code will use initial exec as specified.
+  ; X64:     f5:
+  ; X64:     external_ie@GOTTPOFF
+  ; X32:     f5:
+  ; X32:     external_ie@INDNTPOFF
+  ; X64_PIC: f5:
+  ; X64_PIC: external_ie@GOTTPOFF
+  ; X32_PIC: f5:
+  ; X32_PIC: external_ie@GOTNTPOFF
+  ; DARWIN:  f5:
+  ; DARWIN:  _external_ie@TLVP
+}
+
+define i32* @f6() {
+entry:
+  ret i32* @internal_ie
+
+  ; Non-PIC code can use local exec, PIC code uses initial exec as specified.
+  ; X64:     f6:
+  ; X64:     internal_ie@TPOFF
+  ; X32:     f6:
+  ; X32:     internal_ie@NTPOFF
+  ; X64_PIC: f6:
+  ; X64_PIC: internal_ie@GOTTPOFF
+  ; X32_PIC: f6:
+  ; X32_PIC: internal_ie@GOTNTPOFF
+  ; DARWIN:  f6:
+  ; DARWIN:  _internal_ie@TLVP
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+  ret i32* @external_le
+
+  ; Non-PIC and PIC code will use local exec as specified.
+  ; X64:     f7:
+  ; X64:     external_le@TPOFF
+  ; X32:     f7:
+  ; X32:     external_le@NTPOFF
+  ; X64_PIC: f7:
+  ; X64_PIC: external_le@TPOFF
+  ; X32_PIC: f7:
+  ; X32_PIC: external_le@NTPOFF
+  ; DARWIN:  f7:
+  ; DARWIN:  _external_le@TLVP
+}
+
+define i32* @f8() {
+entry:
+  ret i32* @internal_le
+
+  ; Non-PIC and PIC code will use local exec as specified.
+  ; X64:     f8:
+  ; X64:     internal_le@TPOFF
+  ; X32:     f8:
+  ; X32:     internal_le@NTPOFF
+  ; X64_PIC: f8:
+  ; X64_PIC: internal_le@TPOFF
+  ; X32_PIC: f8:
+  ; X32_PIC: internal_le@NTPOFF
+  ; DARWIN:  f8:
+  ; DARWIN:  _internal_le@TLVP
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index b83416d4b32b8..51c3d2363f8bf 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
 
 @i = thread_local global i32 15
+@j = internal thread_local global i32 42
+@k = internal thread_local global i32 42
 
 define i32 @f1() {
 entry:
@@ -64,4 +66,22 @@ entry:
 ; X64:   callq __tls_get_addr@PLT
 
 
+define i32 @f5() nounwind {
+entry:
+	%0 = load i32* @j, align 4
+	%1 = load i32* @k, align 4
+	%add = add nsw i32 %0, %1
+	ret i32 %add
+}
 
+; X32:    f5:
+; X32:      leal {{[jk]}}@TLSLDM(%ebx)
+; X32-NEXT: calll ___tls_get_addr@PLT
+; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
+; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+
+; X64:    f5:
+; X64:      leaq {{[jk]}}@TLSLD(%rip), %rdi
+; X64-NEXT: callq	__tls_get_addr@PLT
+; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
+; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index e2e58a541a4cd..3fca9f5a37910 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
 ; RUN:   | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
 ; RUN:   | FileCheck -check-prefix=X64 %s
 
 @i = thread_local global i32 15
@@ -35,7 +35,12 @@ entry:
 
 define i32 @f3() {
 ; X32: f3:
-; X32:      movl i2@INDNTPOFF, %eax
+; X32:      calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %eax
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax
+; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax
 ; X32-NEXT: movl %gs:(%eax), %eax
 ; X32-NEXT: ret
 ; X64: f3:
@@ -50,8 +55,13 @@ entry:
 
 define i32* @f4() {
 ; X32: f4:
-; X32:      movl %gs:0, %eax
-; X32-NEXT: addl i2@INDNTPOFF, %eax
+; X32:      calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %ecx
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx
+; X32-NEXT: movl %gs:0, %eax
+; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax
 ; X32-NEXT: ret
 ; X64: f4:
 ; X64:      movq %fs:0, %rax
diff --git a/test/CodeGen/X86/trap.ll b/test/CodeGen/X86/trap.ll
index 03ae6bfc869ec..3f44be0b500c6 100644
--- a/test/CodeGen/X86/trap.ll
+++ b/test/CodeGen/X86/trap.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep ud2
-define i32 @test() noreturn nounwind  {
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+
+; CHECK: test0:
+; CHECK: ud2
+define i32 @test0() noreturn nounwind  {
 entry:
 	tail call void @llvm.trap( )
 	unreachable
 }
 
+; CHECK: test1:
+; CHECK: int3
+define i32 @test1() noreturn nounwind  {
+entry:
+	tail call void @llvm.debugtrap( )
+	unreachable
+}
+
 declare void @llvm.trap() nounwind 
+declare void @llvm.debugtrap() nounwind 
 
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 57d6e97767b8f..9877d7be169b2 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
 
 ;CHECK: load_2_i8
 ; A single 16-bit load
diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll
index 6f16a2548aa67..af6d47af7a0fb 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-2.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& \
-; RUN:   grep {twoaddrinstr} | grep {Number of instructions aggressively commuted}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
+; RUN:   grep "twoaddrinstr" | grep "Number of instructions aggressively commuted"
 ; rdar://6480363
 
 target triple = "i386-apple-darwin9.6"
diff --git a/test/CodeGen/X86/twoaddr-pass-sink.ll b/test/CodeGen/X86/twoaddr-pass-sink.ll
index 077fee0773926..513c304e3bf84 100644
--- a/test/CodeGen/X86/twoaddr-pass-sink.ll
+++ b/test/CodeGen/X86/twoaddr-pass-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& grep {Number of 3-address instructions sunk}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk"
 
 define void @t2(<2 x i64>* %vDct, <2 x i64>* %vYp, i8* %skiplist, <2 x i64> %a1) nounwind  {
 entry:
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index 41ee1947edc49..0536eb05222c5 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {sub.*esp}
+; RUN: llc < %s -march=x86 -mcpu=yonah | not grep "sub.*esp"
 ; RUN: llc < %s -march=x86 -mcpu=yonah | grep cvtsi2ss
 ; rdar://6034396
 
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index 7416051693be0..56fdadbf937bc 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {jc} | count 1
+; RUN: llc < %s -march=x86 | grep "jc" | count 1
 ; XFAIL: *
 
 ; FIXME: umul-with-overflow not supported yet.
diff --git a/test/CodeGen/X86/unwindraise.ll b/test/CodeGen/X86/unwindraise.ll
new file mode 100644
index 0000000000000..a438723d9bd41
--- /dev/null
+++ b/test/CodeGen/X86/unwindraise.ll
@@ -0,0 +1,252 @@
+; RUN: llc < %s -verify-machineinstrs
+; PR13188
+;
+; The _Unwind_RaiseException function can return normally and via eh.return.
+; This causes confusion about the function live-out registers, since the two
+; different ways of returning have different return values.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-freebsd9.0"
+
+%struct._Unwind_Context = type { [18 x i8*], i8*, i8*, i8*, %struct.dwarf_eh_bases, i64, i64, i64, [18 x i8] }
+%struct.dwarf_eh_bases = type { i8*, i8*, i8* }
+%struct._Unwind_FrameState = type { %struct.frame_state_reg_info, i64, i64, i8*, i32, i8*, i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)*, i64, i64, i64, i8, i8, i8, i8, i8* }
+%struct.frame_state_reg_info = type { [18 x %struct.anon], %struct.frame_state_reg_info* }
+%struct.anon = type { %union.anon, i32 }
+%union.anon = type { i64 }
+%struct._Unwind_Exception = type { i64, void (i32, %struct._Unwind_Exception*)*, i64, i64 }
+
+@dwarf_reg_size_table = external hidden unnamed_addr global [18 x i8], align 16
+
+declare void @abort() noreturn
+
+declare fastcc i32 @uw_frame_state_for(%struct._Unwind_Context*, %struct._Unwind_FrameState*) uwtable
+
+define hidden i32 @_Unwind_RaiseException(%struct._Unwind_Exception* %exc) uwtable {
+entry:
+  %fs.i = alloca %struct._Unwind_FrameState, align 8
+  %this_context = alloca %struct._Unwind_Context, align 8
+  %cur_context = alloca %struct._Unwind_Context, align 8
+  %fs = alloca %struct._Unwind_FrameState, align 8
+  call void @llvm.eh.unwind.init()
+  %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+  %1 = call i8* @llvm.returnaddress(i32 0)
+  call fastcc void @uw_init_context_1(%struct._Unwind_Context* %this_context, i8* %0, i8* %1)
+  %2 = bitcast %struct._Unwind_Context* %cur_context to i8*
+  %3 = bitcast %struct._Unwind_Context* %this_context to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+  %personality = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 6
+  %retaddr_column.i = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 9
+  %flags.i.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 5
+  %ra.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 2
+  %exception_class = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 0
+  br label %while.body
+
+while.body:                                       ; preds = %uw_update_context.exit, %entry
+  %call = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+  switch i32 %call, label %do.end21 [
+    i32 5, label %do.end21.loopexit46
+    i32 0, label %if.end3
+  ]
+
+if.end3:                                          ; preds = %while.body
+  %4 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality, align 8, !tbaa !0
+  %tobool = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %4, null
+  br i1 %tobool, label %if.end13, label %if.then4
+
+if.then4:                                         ; preds = %if.end3
+  %5 = load i64* %exception_class, align 8, !tbaa !3
+  %call6 = call i32 %4(i32 1, i32 1, i64 %5, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+  switch i32 %call6, label %do.end21.loopexit46 [
+    i32 6, label %while.end
+    i32 8, label %if.end13
+  ]
+
+if.end13:                                         ; preds = %if.then4, %if.end3
+  call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+  %6 = load i64* %retaddr_column.i, align 8, !tbaa !3
+  %conv.i = trunc i64 %6 to i32
+  %cmp.i.i.i = icmp slt i32 %conv.i, 18
+  br i1 %cmp.i.i.i, label %cond.end.i.i.i, label %cond.true.i.i.i
+
+cond.true.i.i.i:                                  ; preds = %if.end13
+  call void @abort() noreturn
+  unreachable
+
+cond.end.i.i.i:                                   ; preds = %if.end13
+  %sext.i = shl i64 %6, 32
+  %idxprom.i.i.i = ashr exact i64 %sext.i, 32
+  %arrayidx.i.i.i = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i
+  %7 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1
+  %arrayidx2.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i
+  %8 = load i8** %arrayidx2.i.i.i, align 8, !tbaa !0
+  %9 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+  %and.i.i.i.i = and i64 %9, 4611686018427387904
+  %tobool.i.i.i = icmp eq i64 %and.i.i.i.i, 0
+  br i1 %tobool.i.i.i, label %if.end.i.i.i, label %land.lhs.true.i.i.i
+
+land.lhs.true.i.i.i:                              ; preds = %cond.end.i.i.i
+  %arrayidx4.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i
+  %10 = load i8* %arrayidx4.i.i.i, align 1, !tbaa !1
+  %tobool6.i.i.i = icmp eq i8 %10, 0
+  br i1 %tobool6.i.i.i, label %if.end.i.i.i, label %if.then.i.i.i
+
+if.then.i.i.i:                                    ; preds = %land.lhs.true.i.i.i
+  %11 = ptrtoint i8* %8 to i64
+  br label %uw_update_context.exit
+
+if.end.i.i.i:                                     ; preds = %land.lhs.true.i.i.i, %cond.end.i.i.i
+  %cmp8.i.i.i = icmp eq i8 %7, 8
+  br i1 %cmp8.i.i.i, label %if.then10.i.i.i, label %cond.true14.i.i.i
+
+if.then10.i.i.i:                                  ; preds = %if.end.i.i.i
+  %12 = bitcast i8* %8 to i64*
+  %13 = load i64* %12, align 8, !tbaa !3
+  br label %uw_update_context.exit
+
+cond.true14.i.i.i:                                ; preds = %if.end.i.i.i
+  call void @abort() noreturn
+  unreachable
+
+uw_update_context.exit:                           ; preds = %if.then10.i.i.i, %if.then.i.i.i
+  %retval.0.i.i.i = phi i64 [ %11, %if.then.i.i.i ], [ %13, %if.then10.i.i.i ]
+  %14 = inttoptr i64 %retval.0.i.i.i to i8*
+  store i8* %14, i8** %ra.i, align 8, !tbaa !0
+  br label %while.body
+
+while.end:                                        ; preds = %if.then4
+  %private_1 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 2
+  store i64 0, i64* %private_1, align 8, !tbaa !3
+  %15 = load i8** %ra.i, align 8, !tbaa !0
+  %16 = ptrtoint i8* %15 to i64
+  %private_2 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 3
+  store i64 %16, i64* %private_2, align 8, !tbaa !3
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+  %17 = bitcast %struct._Unwind_FrameState* %fs.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %17)
+  %personality.i = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 6
+  %retaddr_column.i22 = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 9
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %uw_update_context.exit44, %while.end
+  %call.i = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+  %18 = load i8** %ra.i, align 8, !tbaa !0
+  %19 = ptrtoint i8* %18 to i64
+  %20 = load i64* %private_2, align 8, !tbaa !3
+  %cmp.i = icmp eq i64 %19, %20
+  %cmp2.i = icmp eq i32 %call.i, 0
+  br i1 %cmp2.i, label %if.end.i, label %do.end21
+
+if.end.i:                                         ; preds = %while.body.i
+  %21 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality.i, align 8, !tbaa !0
+  %tobool.i = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %21, null
+  br i1 %tobool.i, label %if.end12.i, label %if.then3.i
+
+if.then3.i:                                       ; preds = %if.end.i
+  %or.i = select i1 %cmp.i, i32 6, i32 2
+  %22 = load i64* %exception_class, align 8, !tbaa !3
+  %call5.i = call i32 %21(i32 1, i32 %or.i, i64 %22, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+  switch i32 %call5.i, label %do.end21 [
+    i32 7, label %do.body19
+    i32 8, label %if.end12.i
+  ]
+
+if.end12.i:                                       ; preds = %if.then3.i, %if.end.i
+  br i1 %cmp.i, label %cond.true.i, label %cond.end.i
+
+cond.true.i:                                      ; preds = %if.end12.i
+  call void @abort() noreturn
+  unreachable
+
+cond.end.i:                                       ; preds = %if.end12.i
+  call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+  %23 = load i64* %retaddr_column.i22, align 8, !tbaa !3
+  %conv.i23 = trunc i64 %23 to i32
+  %cmp.i.i.i24 = icmp slt i32 %conv.i23, 18
+  br i1 %cmp.i.i.i24, label %cond.end.i.i.i33, label %cond.true.i.i.i25
+
+cond.true.i.i.i25:                                ; preds = %cond.end.i
+  call void @abort() noreturn
+  unreachable
+
+cond.end.i.i.i33:                                 ; preds = %cond.end.i
+  %sext.i26 = shl i64 %23, 32
+  %idxprom.i.i.i27 = ashr exact i64 %sext.i26, 32
+  %arrayidx.i.i.i28 = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i27
+  %24 = load i8* %arrayidx.i.i.i28, align 1, !tbaa !1
+  %arrayidx2.i.i.i29 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i27
+  %25 = load i8** %arrayidx2.i.i.i29, align 8, !tbaa !0
+  %26 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+  %and.i.i.i.i31 = and i64 %26, 4611686018427387904
+  %tobool.i.i.i32 = icmp eq i64 %and.i.i.i.i31, 0
+  br i1 %tobool.i.i.i32, label %if.end.i.i.i39, label %land.lhs.true.i.i.i36
+
+land.lhs.true.i.i.i36:                            ; preds = %cond.end.i.i.i33
+  %arrayidx4.i.i.i34 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i27
+  %27 = load i8* %arrayidx4.i.i.i34, align 1, !tbaa !1
+  %tobool6.i.i.i35 = icmp eq i8 %27, 0
+  br i1 %tobool6.i.i.i35, label %if.end.i.i.i39, label %if.then.i.i.i37
+
+if.then.i.i.i37:                                  ; preds = %land.lhs.true.i.i.i36
+  %28 = ptrtoint i8* %25 to i64
+  br label %uw_update_context.exit44
+
+if.end.i.i.i39:                                   ; preds = %land.lhs.true.i.i.i36, %cond.end.i.i.i33
+  %cmp8.i.i.i38 = icmp eq i8 %24, 8
+  br i1 %cmp8.i.i.i38, label %if.then10.i.i.i40, label %cond.true14.i.i.i41
+
+if.then10.i.i.i40:                                ; preds = %if.end.i.i.i39
+  %29 = bitcast i8* %25 to i64*
+  %30 = load i64* %29, align 8, !tbaa !3
+  br label %uw_update_context.exit44
+
+cond.true14.i.i.i41:                              ; preds = %if.end.i.i.i39
+  call void @abort() noreturn
+  unreachable
+
+uw_update_context.exit44:                         ; preds = %if.then10.i.i.i40, %if.then.i.i.i37
+  %retval.0.i.i.i42 = phi i64 [ %28, %if.then.i.i.i37 ], [ %30, %if.then10.i.i.i40 ]
+  %31 = inttoptr i64 %retval.0.i.i.i42 to i8*
+  store i8* %31, i8** %ra.i, align 8, !tbaa !0
+  br label %while.body.i
+
+do.body19:                                        ; preds = %if.then3.i
+  call void @llvm.lifetime.end(i64 -1, i8* %17)
+  %call20 = call fastcc i64 @uw_install_context_1(%struct._Unwind_Context* %this_context, %struct._Unwind_Context* %cur_context)
+  %32 = load i8** %ra.i, align 8, !tbaa !0
+  call void @llvm.eh.return.i64(i64 %call20, i8* %32)
+  unreachable
+
+do.end21.loopexit46:                              ; preds = %if.then4, %while.body
+  %retval.0.ph = phi i32 [ 3, %if.then4 ], [ 5, %while.body ]
+  br label %do.end21
+
+do.end21:                                         ; preds = %do.end21.loopexit46, %if.then3.i, %while.body.i, %while.body
+  %retval.0 = phi i32 [ %retval.0.ph, %do.end21.loopexit46 ], [ 3, %while.body ], [ 2, %while.body.i ], [ 2, %if.then3.i ]
+  ret i32 %retval.0
+}
+
+declare void @llvm.eh.unwind.init() nounwind
+
+declare fastcc void @uw_init_context_1(%struct._Unwind_Context*, i8*, i8*) uwtable
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+declare fastcc i64 @uw_install_context_1(%struct._Unwind_Context*, %struct._Unwind_Context*) uwtable
+
+declare void @llvm.eh.return.i64(i64, i8*) nounwind
+
+declare fastcc void @uw_update_context_1(%struct._Unwind_Context*, %struct._Unwind_FrameState* nocapture) uwtable
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"long", metadata !1}
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
index ae3f55a316fa5..569586af4983e 100644
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ b/test/CodeGen/X86/v-binop-widen2.ll
@@ -1,9 +1,16 @@
-; RUN: llc -march=x86 -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
 
 %vec = type <6 x float>
 ; CHECK: divss
 ; CHECK: divss
 ; CHECK: divps
+
+; Scheduler causes a different instruction order to be produced on Intel Atom
+; ATOM: divps
+; ATOM: divss
+; ATOM: divss
+
 define %vec @vecdiv( %vec %p1, %vec %p2)
 {
   %result = fdiv %vec %p1, %p2
diff --git a/test/CodeGen/X86/vec_call.ll b/test/CodeGen/X86/vec_call.ll
index f2fc7e7d9d5dc..e0862ca8d1c43 100644
--- a/test/CodeGen/X86/vec_call.ll
+++ b/test/CodeGen/X86/vec_call.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN:   grep {subl.*60}
+; RUN:   grep "subl.*60"
 ; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN:   grep {movaps.*32}
+; RUN:   grep "movaps.*32"
 
 
 define void @test() {
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
new file mode 100644
index 0000000000000..08eb16f6313ba
--- /dev/null
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+;CHECK: foo1_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo1_8(<8 x i8> %src) { ; test: signed <8 x i8> to <8 x float>; directives above expect vcvtdq2ps then ret
+  %res = sitofp <8 x i8> %src to <8 x float> ; per-lane signed integer to float conversion
+  ret <8 x float> %res
+}
+
+;CHECK: foo1_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo1_4(<4 x i8> %src) { ; test: signed <4 x i8> to <4 x float>; directives above expect vcvtdq2ps then ret
+  %res = sitofp <4 x i8> %src to <4 x float> ; per-lane signed integer to float conversion
+  ret <4 x float> %res
+}
+
+;CHECK: foo2_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo2_8(<8 x i8> %src) { ; test: unsigned <8 x i8> to <8 x float>; directives above expect vcvtdq2ps then ret
+  %res = uitofp <8 x i8> %src to <8 x float> ; per-lane unsigned integer to float conversion
+  ret <8 x float> %res
+}
+
+;CHECK: foo2_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo2_4(<4 x i8> %src) { ; test: unsigned <4 x i8> to <4 x float>; directives above expect vcvtdq2ps then ret
+  %res = uitofp <4 x i8> %src to <4 x float> ; per-lane unsigned integer to float conversion
+  ret <4 x float> %res
+}
+
+;CHECK: foo3_8
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <8 x i8> @foo3_8(<8 x float> %src) { ; test: <8 x float> to signed <8 x i8>; directives above expect vcvttps2dq then ret
+  %res = fptosi <8 x float> %src to <8 x i8> ; per-lane float to signed integer conversion
+  ret <8 x i8> %res
+}
+;CHECK: foo3_4
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <4 x i8> @foo3_4(<4 x float> %src) { ; test: <4 x float> to signed <4 x i8>; directives above expect vcvttps2dq then ret
+  %res = fptosi <4 x float> %src to <4 x i8> ; per-lane float to signed integer conversion
+  ret <4 x i8> %res
+}
+
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 91777f7aa6b40..46d6a23554f4a 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
 entry:
 ; CHECK: cfi_def_cfa_offset
 ; CHECK-NOT: set
-; CHECK: movzwl
-; CHECK: movzwl
+; CHECK: punpcklwd
 ; CHECK: pshufd
 ; CHECK: pshufb
   %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 39c9b770d5f49..367dd27f30769 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
 
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 29511934af019..565be7a6cc709 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep {(%esp,%eax,4)} | count 4
+; RUN: llc < %s -march=x86 -mcpu=yonah | grep "(%esp,%eax,4)" | count 4
 
 ; Inserts and extracts with variable indices must be lowered
 ; to memory accesses.
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
index de3b36ff126c2..2a4864a48a257 100644
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ b/test/CodeGen/X86/vec_insert-6.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
 
 define <4 x float> @t3(<4 x float>* %P) nounwind  {
 	%tmp1 = load <4 x float>* %P
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index ada17e0092a80..d1d7608a04117 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
 ; RUN: grep pshufd %t | count 2
 
 define <4 x float> @test(float %a) nounwind {
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
index 3656e5f6ca47b..b8ec0cf080957 100644
--- a/test/CodeGen/X86/vec_set-9.ll
+++ b/test/CodeGen/X86/vec_set-9.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86-64 | grep movd | count 1
-; RUN: llc < %s -march=x86-64 | grep {movlhps.*%xmm0, %xmm0}
+; RUN: llc < %s -march=x86-64 | grep "movlhps.*%xmm0, %xmm0"
 
 define <2 x i64> @test3(i64 %A) nounwind {
 entry:
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
index 06f38ed842723..09d4c1a64a01c 100644
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ b/test/CodeGen/X86/vec_shuffle-16.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
 
 ; sse:  t1:
 ; sse2: t1:
diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
index 861a1cc5b93cf..b26f920e5e230 100644
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ b/test/CodeGen/X86/vec_shuffle-19.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
 ; PR2485
 
 define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind  {
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
index dec98c7400a55..0aff822850c09 100644
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
 
 ; ModuleID = 'vec_shuffle-27.bc'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -35,4 +35,4 @@ entry:
   store <4 x i64> %vect1487, <4 x i64>* %ap
   store <4 x i64> %vect1488, <4 x i64>* %bp
   ret void;
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
index 7f0fcb5969e4f..f5083b4b8011a 100644
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ b/test/CodeGen/X86/vec_shuffle-35.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 13
-; RUN: grep pinsrw %t | count 14
+; RUN: grep pextrw %t | count 12
+; RUN: grep pinsrw %t | count 13
 ; RUN: grep rolw %t | count 13
 ; RUN: not grep esp %t
 ; RUN: not grep ebp %t
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
index 8090afc7434df..9a06015745ed3 100644
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ b/test/CodeGen/X86/vec_shuffle-36.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse41 | FileCheck %s
 
 define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; CHECK: pshufb
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
index 430aa046afab6..ed285f93fe1bd 100644
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
 ; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
 
 define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
index 96ef883c4e1e4..ec196df7aeff8 100644
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ b/test/CodeGen/X86/vec_shuffle-38.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
 
 define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
 ; CHECK: unpcklpd
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index 55531e305cb86..ee8d2d5e0b3ef 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
 ; rdar://10050222, rdar://10134392
 
 define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
index cde5ae99563e5..f105de4d977d5 100644
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ b/test/CodeGen/X86/vec_splat-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd | count 1
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd | count 1
 
 define void @test(<2 x i64>* %P, i8 %x) nounwind {
 	%tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0		; <<16 x i8>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
index 649b85c5dadd6..feacc42406df7 100644
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
 ; RUN: grep punpcklwd %t | count 4
 ; RUN: grep punpckhwd %t | count 4
 ; RUN: grep "pshufd" %t | count 8
diff --git a/test/CodeGen/X86/vec_splat-4.ll b/test/CodeGen/X86/vec_splat-4.ll
index d9941e65bde35..374acfa4e094c 100644
--- a/test/CodeGen/X86/vec_splat-4.ll
+++ b/test/CodeGen/X86/vec_splat-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
 ; RUN: grep punpcklbw %t | count 16
 ; RUN: grep punpckhbw %t | count 16
 ; RUN: grep "pshufd" %t | count 16
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index a87fbd0dc6555..24d8487f17bd4 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd
-; RUN: llc < %s -march=x86 -mattr=+sse3 | grep movddup
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse3 | grep movddup
 
 define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
 	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 3bd3f7b60b3bb..c294df575a10c 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -70,3 +70,17 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
 ; CHECK: call
 ; CHECK: roundss $4, %xmm{{.*}}, %xmm0
 }
+
+; PR13576 
+define  <2 x double> @test5() nounwind uwtable readnone noinline { ; PR13576 test: cvtsi2sd intrinsic called with all-constant operands
+entry:
+  %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double
+4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone ; operands: constant vector <456.987, 123.321> and constant i32 128
+  ret <2 x double> %0 ; directives below expect two "mov" matches followed by "cvtsi2sd" in the llc output
+; CHECK: test5:
+; CHECK: mov
+; CHECK: mov
+; CHECK: cvtsi2sd
+}
+
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index 49551562c5ae7..e775750bbea5f 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 entry:
 ; CHECK: shift1b:
 ; CHECK: movd
-; CHECK-NEXT: psllq
+; CHECK: psllq
   %0 = insertelement <2 x i64> undef, i64 %amt, i32 0
   %1 = insertelement <2 x i64> %0, i64 %amt, i32 1
   %shl = shl <2 x i64> %val, %1
@@ -38,7 +38,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
 entry:
 ; CHECK: shift2b:
 ; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
   %0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %1 = insertelement <4 x i32> %0, i32 %amt, i32 1
   %2 = insertelement <4 x i32> %1, i32 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index 9a9b419abea5b..9496893bd1a73 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 entry:
 ; CHECK: shift1b:
 ; CHECK: movd
-; CHECK-NEXT: psrlq
+; CHECK: psrlq
   %0 = insertelement <2 x i64> undef, i64 %amt, i32 0
   %1 = insertelement <2 x i64> %0, i64 %amt, i32 1
   %lshr = lshr <2 x i64> %val, %1
@@ -37,7 +37,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
 entry:
 ; CHECK: shift2b:
 ; CHECK: movd
-; CHECK-NEXT: psrld
+; CHECK: psrld
   %0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %1 = insertelement <4 x i32> %0, i32 %amt, i32 1
   %2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -63,7 +63,7 @@ entry:
 ; CHECK: shift3b:
 ; CHECK: movzwl
 ; CHECK: movd
-; CHECK-NEXT: psrlw
+; CHECK: psrlw
   %0 = insertelement <8 x i16> undef, i16 %amt, i32 0
   %1 = insertelement <8 x i16> %0, i16 %amt, i32 1
   %2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index 8e8a9aa04b279..b2b48b9da9351 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -28,7 +28,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
 entry:
 ; CHECK: shift2b:
 ; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
   %0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %1 = insertelement <4 x i32> %0, i32 %amt, i32 1
   %2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -52,7 +52,7 @@ entry:
 ; CHECK: shift3b:
 ; CHECK: movzwl
 ; CHECK: movd
-; CHECK-NEXT: psraw
+; CHECK: psraw
   %0 = insertelement <8 x i16> undef, i16 %amt, i32 0
   %1 = insertelement <8 x i16> %0, i16 %amt, i32 1
   %2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-5.ll b/test/CodeGen/X86/vshift-5.ll
index cb254aeb57350..f6c311dee5211 100644
--- a/test/CodeGen/X86/vshift-5.ll
+++ b/test/CodeGen/X86/vshift-5.ll
@@ -6,7 +6,7 @@ define void @shift5a(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
 entry:
 ; CHECK: shift5a:
 ; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
   %amt = load i32* %pamt 
   %tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer 
@@ -20,7 +20,7 @@ define void @shift5b(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
 entry:
 ; CHECK: shift5b:
 ; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
   %amt = load i32* %pamt 
   %tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer 
@@ -34,7 +34,7 @@ define void @shift5c(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
 entry:
 ; CHECK: shift5c:
 ; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
   %tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
   %shl = shl <4 x i32> %val, %shamt
@@ -47,7 +47,7 @@ define void @shift5d(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
 entry:
 ; CHECK: shift5d:
 ; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
   %tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
   %shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
   %shr = ashr <4 x i32> %val, %shamt
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index f55b184f3acc1..d86042a44806f 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -2,7 +2,6 @@
 ; CHECK: incl
 ; CHECK: incl
 ; CHECK: incl
-; CHECK: addl
 
 ; Widen a v3i16 to v8i16 to do a vector add
 
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 4330aae8ec823..ebdfea9a37f7b 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,7 +1,14 @@
-; RUN: llc -march=x86 -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+
 ; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movd
+; CHECK: movl
+; CHECK: movlpd
+
+; Scheduler causes a different instruction order to be produced on Intel Atom
+; ATOM: movl
+; ATOM: paddd
+; ATOM: movlpd
 
 ; bitcast a v4i16 to v2i32
 
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 5c695ea00033c..3979ce466d1e4 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
 ; CHECK: pextrd
 ; CHECK: pextrd
 ; CHECK: movd
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 136578df1e8e8..9086d3a9cfd2d 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,9 +1,8 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
 ; CHECK: movl
-; CHECK: movd
+; CHECK: movlpd
 
 ; bitcast a i64 to v2i32
-
 define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
 entry:
 	%conv = bitcast i64 %src to <2 x i32>
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index affd796ffc3fa..1158e04553921 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
 ; CHECK-NOT: cvtsi2ss
 
 ; unsigned to float v7i16 to v7f32
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index 4bcac58f2b6c8..86727421ce032 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
 ; widen extract subvector
 
 define void @convert(<2 x double>* %dst.addr, <3 x double> %src)  {
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index 4aeec9136d0ee..d5437281b274f 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -1,18 +1,12 @@
 ; RUN: llc < %s -o - -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -o - -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s -check-prefix=WIN64
 ; PR4891
 
 ; Both loads should happen before either store.
 
-; CHECK: movd  ({{.*}}), {{.*}}
-; CHECK: movd  ({{.*}}), {{.*}}
-; CHECK: movd  {{.*}}, ({{.*}})
-; CHECK: movd  {{.*}}, ({{.*}})
-
-; WIN64: movd  ({{.*}}), {{.*}}
-; WIN64: movd  ({{.*}}), {{.*}}
-; WIN64: movd  {{.*}}, ({{.*}})
-; WIN64: movd  {{.*}}, ({{.*}})
+; CHECK: movl  ({{.*}}), {{.*}}
+; CHECK: movl  ({{.*}}), {{.*}}
+; CHECK: movl  {{.*}}, ({{.*}})
+; CHECK: movl  {{.*}}, ({{.*}})
 
 define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
 entry:
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index a961c6af18844..cc11e4c28e212 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,12 +1,9 @@
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
 
-; Passing the same value in two registers creates a false interference that
-; only -join-physregs resolves. It could also be handled by a parallel copy.
-
 define i64 @foo(i64 %n, i64 %x) nounwind {
 entry:
 
@@ -31,19 +28,19 @@ entry:
 
   %buf1 = alloca i8, i64 %n, align 1
 
-; M64: leaq  15(%rcx), %rax
+; M64: leaq  15(%{{.*}}), %rax
 ; M64: andq  $-16, %rax
 ; M64: callq ___chkstk
 ; M64-NOT:   %rsp
 ; M64: movq  %rsp, %rax
 
-; W64: leaq  15(%rcx), %rax
+; W64: leaq  15(%{{.*}}), %rax
 ; W64: andq  $-16, %rax
 ; W64: callq __chkstk
 ; W64: subq  %rax, %rsp
 ; W64: movq  %rsp, %rax
 
-; EFI: leaq  15(%rcx), [[R1:%r.*]]
+; EFI: leaq  15(%{{.*}}), [[R1:%r.*]]
 ; EFI: andq  $-16, [[R1]]
 ; EFI: movq  %rsp, [[R64:%r.*]]
 ; EFI: subq  [[R1]], [[R64]]
diff --git a/test/CodeGen/X86/x86-64-arg.ll b/test/CodeGen/X86/x86-64-arg.ll
index ec8dd8edb6342..9a959e839a954 100644
--- a/test/CodeGen/X86/x86-64-arg.ll
+++ b/test/CodeGen/X86/x86-64-arg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl	%edi, %eax}
+; RUN: llc < %s | grep "movl	%edi, %eax"
 ; The input value is already sign extended, don't re-extend it.
 ; This testcase corresponds to:
 ;   int test(short X) { return (int)X; }
diff --git a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
index 79316f29de37a..902c9d5ae081d 100644
--- a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
+++ b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | not grep rsp
-; RUN: llc < %s | grep cvttsd2siq
+; RUN: llc < %s -mcpu=nehalem | not grep rsp
+; RUN: llc < %s -mcpu=nehalem | grep cvttsd2siq
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/x86-64-pic-1.ll b/test/CodeGen/X86/x86-64-pic-1.ll
index 46f6d335d05c7..46cd4f81bcf9d 100644
--- a/test/CodeGen/X86/x86-64-pic-1.ll
+++ b/test/CodeGen/X86/x86-64-pic-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq	f@PLT} %t1
+; RUN: grep "callq	f@PLT" %t1
 
 define void @g() {
 entry:
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index b6f82e23b7e7b..3ec172b2b656e 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq	g@PLT} %t1
+; RUN: grep "callq	g@PLT" %t1
 
 @g = alias weak i32 ()* @f
 
diff --git a/test/CodeGen/X86/x86-64-pic-11.ll b/test/CodeGen/X86/x86-64-pic-11.ll
index 4db331cee43f7..fd64beb696b25 100644
--- a/test/CodeGen/X86/x86-64-pic-11.ll
+++ b/test/CodeGen/X86/x86-64-pic-11.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq	__fixunsxfti@PLT} %t1
+; RUN: grep "callq	__fixunsxfti@PLT" %t1
 
 define i128 @f(x86_fp80 %a) nounwind {
 entry:
diff --git a/test/CodeGen/X86/x86-64-pic-2.ll b/test/CodeGen/X86/x86-64-pic-2.ll
index 1ce2de7209c4f..f3f7b1dffd1f0 100644
--- a/test/CodeGen/X86/x86-64-pic-2.ll
+++ b/test/CodeGen/X86/x86-64-pic-2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq	f} %t1
-; RUN: not grep {callq	f@PLT} %t1
+; RUN: grep "callq	f" %t1
+; RUN: not grep "callq	f@PLT" %t1
 
 define void @g() {
 entry:
diff --git a/test/CodeGen/X86/x86-64-pic-3.ll b/test/CodeGen/X86/x86-64-pic-3.ll
index aa3c888ed600e..ba933788a3a5f 100644
--- a/test/CodeGen/X86/x86-64-pic-3.ll
+++ b/test/CodeGen/X86/x86-64-pic-3.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq	f} %t1
-; RUN: not grep {callq	f@PLT} %t1
+; RUN: grep "callq	f" %t1
+; RUN: not grep "callq	f@PLT" %t1
 
 define void @g() {
 entry:
diff --git a/test/CodeGen/X86/x86-64-pic-4.ll b/test/CodeGen/X86/x86-64-pic-4.ll
index 90fc1194a33bb..33b08c4b4b049 100644
--- a/test/CodeGen/X86/x86-64-pic-4.ll
+++ b/test/CodeGen/X86/x86-64-pic-4.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq	a@GOTPCREL(%rip),} %t1
+; RUN: grep "movq	a@GOTPCREL(%rip)," %t1
 
 @a = global i32 0
 
diff --git a/test/CodeGen/X86/x86-64-pic-5.ll b/test/CodeGen/X86/x86-64-pic-5.ll
index 6369bde6943da..234bc0d2f4f1f 100644
--- a/test/CodeGen/X86/x86-64-pic-5.ll
+++ b/test/CodeGen/X86/x86-64-pic-5.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl	a(%rip),} %t1
+; RUN: grep "movl	a(%rip)," %t1
 ; RUN: not grep GOTPCREL %t1
 
 @a = hidden global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-6.ll b/test/CodeGen/X86/x86-64-pic-6.ll
index 6e19ad35bcf44..ae5b5835928d4 100644
--- a/test/CodeGen/X86/x86-64-pic-6.ll
+++ b/test/CodeGen/X86/x86-64-pic-6.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl	a(%rip),} %t1
+; RUN: grep "movl	a(%rip)," %t1
 ; RUN: not grep GOTPCREL %t1
 
 @a = internal global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-7.ll b/test/CodeGen/X86/x86-64-pic-7.ll
index 4d98ee614026d..de240a38d63a4 100644
--- a/test/CodeGen/X86/x86-64-pic-7.ll
+++ b/test/CodeGen/X86/x86-64-pic-7.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq	f@GOTPCREL(%rip),} %t1
+; RUN: grep "movq	f@GOTPCREL(%rip)," %t1
 
 define void ()* @g() nounwind {
 entry:
diff --git a/test/CodeGen/X86/x86-64-pic-8.ll b/test/CodeGen/X86/x86-64-pic-8.ll
index d3b567c610763..db35c33623fec 100644
--- a/test/CodeGen/X86/x86-64-pic-8.ll
+++ b/test/CodeGen/X86/x86-64-pic-8.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq	f(%rip),} %t1
+; RUN: grep "leaq	f(%rip)," %t1
 ; RUN: not grep GOTPCREL %t1
 
 define void ()* @g() {
diff --git a/test/CodeGen/X86/x86-64-pic-9.ll b/test/CodeGen/X86/x86-64-pic-9.ll
index 076103133fa96..6daea84e1a73c 100644
--- a/test/CodeGen/X86/x86-64-pic-9.ll
+++ b/test/CodeGen/X86/x86-64-pic-9.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq	f(%rip),} %t1
+; RUN: grep "leaq	f(%rip)," %t1
 ; RUN: not grep GOTPCREL %t1
 
 define void ()* @g() nounwind {
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index a2521b0a66db0..8af782cd2f190 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -875,37 +875,37 @@ define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
 }
 declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0, <4 x float> %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
   ; CHECK-NOT: mov
   ; CHECK: vfrczss
-  %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %a1) ;
+  %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
   ret <4 x float> %res
 }
-define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(<4 x float> %a0, float* %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
   ; CHECK-NOT: mov
   ; CHECK: vfrczss
-  %elem = load float* %a1
+  %elem = load float* %a0
   %vec = insertelement <4 x float> undef, float %elem, i32 0
-  %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %vec) ;
+  %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %vec) ;
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
 
-define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0, <2 x double> %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
   ; CHECK-NOT: mov
   ; CHECK: vfrczsd
-  %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %a1) ;
+  %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
   ret <2 x double> %res
 }
-define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(<2 x double> %a0, double* %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
   ; CHECK-NOT: mov
   ; CHECK: vfrczsd
-  %elem = load double* %a1
+  %elem = load double* %a0
   %vec = insertelement <2 x double> undef, double %elem, i32 0
-  %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %vec) ;
+  %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %vec) ;
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
 
 define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
   ; CHECK: vfrczpd
@@ -967,3 +967,59 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
 }
 declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
 
+define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: vpcomb
+  %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: vpcomw
+  %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: vpcomd
+  %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: vpcomq
+  %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: vpcomub
+  %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: vpcomuw
+  %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: vpcomud
+  %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: vpcomuq
+  %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index ddc4cab14a4c5..996bfc40ee564 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2  | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2  | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
 
 ; Though it is undefined, we want xor undef,undef to produce zero.
 define <4 x i32> @test1() nounwind {
@@ -31,7 +31,7 @@ entry:
 ; X64: test3:
 ; X64:	notl
 ; X64:	andl
-; X64:	shrl	%eax
+; X64:	shrl
 ; X64:	ret
 
 ; X32: test3: